Re: [PATCH RFC v3 22/35] arm64: mte: Enable tag storage if CMA areas have been activated

2024-02-05 Thread Alexandru Elisei
Hi Evgenii,

On Fri, Feb 02, 2024 at 02:30:00PM -0800, Evgenii Stepanov wrote:
> On Thu, Jan 25, 2024 at 8:44 AM Alexandru Elisei
>  wrote:
> >
> > Before enabling MTE tag storage management, make sure that the CMA areas
> > have been successfully activated. If a CMA area fails activation, the pages
> > are kept as reserved. Reserved pages are never used by the page allocator.
> >
> > If this happens, the kernel would have to manage tag storage only for some
> > of the memory, but not for all memory, and that would make the code
> > unreasonably complicated.
> >
> > Choose to disable tag storage management altogether if a CMA area fails to
> > be activated.
> >
> > Signed-off-by: Alexandru Elisei 
> > ---
> >
> > Changes since v2:
> >
> > * New patch.
> >
> >  arch/arm64/include/asm/mte_tag_storage.h | 12 ++
> >  arch/arm64/kernel/mte_tag_storage.c  | 50 
> >  2 files changed, 62 insertions(+)
> >
> > diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
> > b/arch/arm64/include/asm/mte_tag_storage.h
> > index 3c2cd29e053e..7b3f6bff8e6f 100644
> > --- a/arch/arm64/include/asm/mte_tag_storage.h
> > +++ b/arch/arm64/include/asm/mte_tag_storage.h
> > @@ -6,8 +6,20 @@
> >  #define __ASM_MTE_TAG_STORAGE_H
> >
> >  #ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> > +
> > +DECLARE_STATIC_KEY_FALSE(tag_storage_enabled_key);
> > +
> > +static inline bool tag_storage_enabled(void)
> > +{
> > > +   return static_branch_likely(&tag_storage_enabled_key);
> > +}
> > +
> >  void mte_init_tag_storage(void);
> >  #else
> > +static inline bool tag_storage_enabled(void)
> > +{
> > +   return false;
> > +}
> >  static inline void mte_init_tag_storage(void)
> >  {
> >  }
> > diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> > b/arch/arm64/kernel/mte_tag_storage.c
> > index 9a1a8a45171e..d58c68b4a849 100644
> > --- a/arch/arm64/kernel/mte_tag_storage.c
> > +++ b/arch/arm64/kernel/mte_tag_storage.c
> > @@ -19,6 +19,8 @@
> >
> >  #include 
> >
> > +__ro_after_init DEFINE_STATIC_KEY_FALSE(tag_storage_enabled_key);
> > +
> >  struct tag_region {
> > struct range mem_range; /* Memory associated with the tag storage, 
> > in PFNs. */
> > struct range tag_range; /* Tag storage memory, in PFNs. */
> > @@ -314,3 +316,51 @@ void __init mte_init_tag_storage(void)
> > num_tag_regions = 0;
> > pr_info("MTE tag storage region management disabled");
> >  }
> > +
> > +static int __init mte_enable_tag_storage(void)
> > +{
> > +   struct range *tag_range;
> > +   struct cma *cma;
> > +   int i, ret;
> > +
> > +   if (num_tag_regions == 0)
> > +   return 0;
> > +
> > +   for (i = 0; i < num_tag_regions; i++) {
> > > +   tag_range = &tag_regions[i].tag_range;
> > +   cma = tag_regions[i].cma;
> > +   /*
> > +* CMA will keep the pages as reserved when the region fails
> > +* activation.
> > +*/
> > +   if (PageReserved(pfn_to_page(tag_range->start)))
> > +   goto out_disabled;
> > +   }
> > +
> > > +   static_branch_enable(&tag_storage_enabled_key);
> > +   pr_info("MTE tag storage region management enabled");
> > +
> > +   return 0;
> > +
> > +out_disabled:
> > +   for (i = 0; i < num_tag_regions; i++) {
> > > +   tag_range = &tag_regions[i].tag_range;
> > +   cma = tag_regions[i].cma;
> > +
> > +   if (PageReserved(pfn_to_page(tag_range->start)))
> > +   continue;
> > +
> > +   /* Try really hard to reserve the tag storage. */
> > +   ret = cma_alloc(cma, range_len(tag_range), 8, true);
> > +   /*
> > +* Tag storage is still in use for data, memory and/or tag
> > +* corruption will ensue.
> > +*/
> > +   WARN_ON_ONCE(ret);
> 
> cma_alloc returns (page *), so this condition needs to be inverted,
> and the type of `ret` changed.
> Not sure how it slipped through, this is a compile error with clang.

Checked just now, it's a warning with gcc, I must have missed it. Will fix.

Thanks,
Alex

> 
> > +   }
> > +   num_tag_regions = 0;
> > +   pr_info("MTE tag storage region management disabled");
> > +
> > +   return -EINVAL;
> > +}
> > +arch_initcall(mte_enable_tag_storage);
> > --
> > 2.43.0
> >



Re: [PATCH RFC v3 28/35] arm64: mte: swap: Handle tag restoring when missing tag storage

2024-02-02 Thread Alexandru Elisei
Hi Peter,

On Thu, Feb 01, 2024 at 08:02:40PM -0800, Peter Collingbourne wrote:
> On Thu, Jan 25, 2024 at 8:45 AM Alexandru Elisei
>  wrote:
> >
> > Linux restores tags when a page is swapped in and there are tags associated
> > with the swap entry which the new page will replace. The saved tags are
> > restored even if the page will not be mapped as tagged, to protect against
> > cases where the page is shared between different VMAs, and is tagged in
> > some, but untagged in others. By using this approach, the process can still
> > access the correct tags following an mprotect(PROT_MTE) on the non-MTE
> > enabled VMA.
> >
> > But this poses a challenge for managing tag storage: in the scenario above,
> > when a new page is allocated to be swapped in for the process where it will
> > be mapped as untagged, the corresponding tag storage block is not reserved.
> > mte_restore_page_tags_by_swp_entry(), when it restores the saved tags, will
> > overwrite data in the tag storage block associated with the new page,
> > leading to data corruption if the block is in use by a process.
> >
> > Get around this issue by saving the tags in a new xarray, this time indexed
> > by the page pfn, and then restoring them when tag storage is reserved for
> > the page.
> >
> > Signed-off-by: Alexandru Elisei 
> > ---
> >
> > Changes since rfc v2:
> >
> > * Restore saved tags **before** setting the PG_tag_storage_reserved bit to
> > eliminate a brief window of opportunity where userspace can access 
> > uninitialized
> > tags (Peter Collingbourne).
> >
> >  arch/arm64/include/asm/mte_tag_storage.h |   8 ++
> >  arch/arm64/include/asm/pgtable.h |  11 +++
> >  arch/arm64/kernel/mte_tag_storage.c  |  12 ++-
> >  arch/arm64/mm/mteswap.c  | 110 +++
> >  4 files changed, 140 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
> > b/arch/arm64/include/asm/mte_tag_storage.h
> > index 50bdae94cf71..40590a8c3748 100644
> > --- a/arch/arm64/include/asm/mte_tag_storage.h
> > +++ b/arch/arm64/include/asm/mte_tag_storage.h
> > @@ -36,6 +36,14 @@ bool page_is_tag_storage(struct page *page);
> >
> >  vm_fault_t handle_folio_missing_tag_storage(struct folio *folio, struct 
> > vm_fault *vmf,
> > bool *map_pte);
> > +vm_fault_t mte_try_transfer_swap_tags(swp_entry_t entry, struct page 
> > *page);
> > +
> > +void tags_by_pfn_lock(void);
> > +void tags_by_pfn_unlock(void);
> > +
> > +void *mte_erase_tags_for_pfn(unsigned long pfn);
> > +bool mte_save_tags_for_pfn(void *tags, unsigned long pfn);
> > +void mte_restore_tags_for_pfn(unsigned long start_pfn, int order);
> >  #else
> >  static inline bool tag_storage_enabled(void)
> >  {
> > diff --git a/arch/arm64/include/asm/pgtable.h 
> > b/arch/arm64/include/asm/pgtable.h
> > index 0174e292f890..87ae59436162 100644
> > --- a/arch/arm64/include/asm/pgtable.h
> > +++ b/arch/arm64/include/asm/pgtable.h
> > @@ -1085,6 +1085,17 @@ static inline void arch_swap_invalidate_area(int 
> > type)
> > mte_invalidate_tags_area_by_swp_entry(type);
> >  }
> >
> > +#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> > +#define __HAVE_ARCH_SWAP_PREPARE_TO_RESTORE
> > +static inline vm_fault_t arch_swap_prepare_to_restore(swp_entry_t entry,
> > + struct folio *folio)
> > +{
> > +   if (tag_storage_enabled())
> > > +   return mte_try_transfer_swap_tags(entry, &folio->page);
> > +   return 0;
> > +}
> > +#endif
> > +
> >  #define __HAVE_ARCH_SWAP_RESTORE
> >  static inline void arch_swap_restore(swp_entry_t entry, struct folio 
> > *folio)
> >  {
> > diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> > b/arch/arm64/kernel/mte_tag_storage.c
> > index afe2bb754879..ac7b9c9c585c 100644
> > --- a/arch/arm64/kernel/mte_tag_storage.c
> > +++ b/arch/arm64/kernel/mte_tag_storage.c
> > @@ -567,6 +567,7 @@ int reserve_tag_storage(struct page *page, int order, 
> > gfp_t gfp)
> > }
> > }
> >
> > +   mte_restore_tags_for_pfn(page_to_pfn(page), order);
> > page_set_tag_storage_reserved(page, order);
> >  out_unlock:
> > > mutex_unlock(&tag_blocks_lock);
> > @@ -595,7 +596,8 @@ void free_tag_storage(struct page *page, int order)
> > struct tag_region *region;
> > unsigned long p

Re: [PATCH RFC v3 31/35] khugepaged: arm64: Don't collapse MTE enabled VMAs

2024-02-01 Thread Alexandru Elisei
On Thu, Feb 01, 2024 at 01:42:08PM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > copy_user_highpage() will do memory allocation if there are saved tags for
> > the destination page, and the page is missing tag storage.
> > 
> > After commit a349d72fd9ef ("mm/pgtable: add rcu_read_lock() and
> > rcu_read_unlock()s"), collapse_huge_page() calls
> > __collapse_huge_page_copy() -> .. -> copy_user_highpage() with the RCU lock
> > held, which means that copy_user_highpage() can only allocate memory using
> > GFP_ATOMIC or equivalent.
> > 
> > Get around this by refusing to collapse pages into a transparent huge page
> > if the VMA is MTE-enabled.
> 
> Makes sense when copy_user_highpage() will allocate memory for tag storage.
> 
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> > 
> > Changes since rfc v2:
> > 
> > * New patch. I think an agreement on whether copy*_user_highpage() should be
> > always allowed to sleep, or should not be allowed, would be useful.
> 
> This is a good question ! Even after preventing the collapse of MTE VMA here,
> there still might be more paths where a sleeping (i.e memory allocating)
> copy*_user_highpage() becomes problematic ?

Exactly!

> 
> > 
> >  arch/arm64/include/asm/pgtable.h| 3 +++
> >  arch/arm64/kernel/mte_tag_storage.c | 5 +
> >  include/linux/khugepaged.h  | 5 +
> >  mm/khugepaged.c | 4 
> >  4 files changed, 17 insertions(+)
> > 
> > diff --git a/arch/arm64/include/asm/pgtable.h 
> > b/arch/arm64/include/asm/pgtable.h
> > index 87ae59436162..d0473538c926 100644
> > --- a/arch/arm64/include/asm/pgtable.h
> > +++ b/arch/arm64/include/asm/pgtable.h
> > @@ -1120,6 +1120,9 @@ static inline bool arch_alloc_cma(gfp_t gfp_mask)
> > return true;
> >  }
> >  
> > +bool arch_hugepage_vma_revalidate(struct vm_area_struct *vma, unsigned 
> > long address);
> > +#define arch_hugepage_vma_revalidate arch_hugepage_vma_revalidate
> > +
> >  #endif /* CONFIG_ARM64_MTE_TAG_STORAGE */
> >  #endif /* CONFIG_ARM64_MTE */
> >  
> > diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> > b/arch/arm64/kernel/mte_tag_storage.c
> > index ac7b9c9c585c..a99959b70573 100644
> > --- a/arch/arm64/kernel/mte_tag_storage.c
> > +++ b/arch/arm64/kernel/mte_tag_storage.c
> > @@ -636,3 +636,8 @@ void arch_alloc_page(struct page *page, int order, 
> > gfp_t gfp)
> > if (tag_storage_enabled() && alloc_requires_tag_storage(gfp))
> > reserve_tag_storage(page, order, gfp);
> >  }
> > +
> > +bool arch_hugepage_vma_revalidate(struct vm_area_struct *vma, unsigned 
> > long address)
> > +{
> > +   return !(vma->vm_flags & VM_MTE);
> > +}
> > diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> > index f68865e19b0b..461e4322dff2 100644
> > --- a/include/linux/khugepaged.h
> > +++ b/include/linux/khugepaged.h
> > @@ -38,6 +38,11 @@ static inline void khugepaged_exit(struct mm_struct *mm)
> > > if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
> > __khugepaged_exit(mm);
> >  }
> > +
> > +#ifndef arch_hugepage_vma_revalidate
> > +#define arch_hugepage_vma_revalidate(vma, address) 1
> 
> Please replace s/1/true as arch_hugepage_vma_revalidate() returns bool ?

Yeah, that's strange, I don't know why I used 1 there. Will change it to true,
thanks for spotting it.

> 
> > +#endif
> 
> Right, above construct is much better than __HAVE_ARCH_ based one.

Thanks!

Alex

> 
> > +
> >  #else /* CONFIG_TRANSPARENT_HUGEPAGE */
> >  static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct 
> > *oldmm)
> >  {
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 2b219acb528e..cb9a9ddb4d86 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -935,6 +935,10 @@ static int hugepage_vma_revalidate(struct mm_struct 
> > *mm, unsigned long address,
> >  */
> > if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
> > return SCAN_PAGE_ANON;
> > +
> > +   if (!arch_hugepage_vma_revalidate(vma, address))
> > +   return SCAN_VMA_CHECK;
> > +
> > return SCAN_SUCCEED;
> >  }
> >  
> 
> Otherwise this LGTM.



Re: [PATCH RFC v3 30/35] arm64: mte: ptrace: Handle pages with missing tag storage

2024-02-01 Thread Alexandru Elisei
Hi,

On Thu, Feb 01, 2024 at 02:51:39PM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > A page can end up mapped in a MTE enabled VMA without the corresponding tag
> > storage block reserved. Tag accesses made by ptrace in this case can lead
> > to the wrong tags being read or memory corruption for the process that is
> > using the tag storage memory as data.
> > 
> > Reserve tag storage by treating ptrace accesses like a fault.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> > 
> > Changes since rfc v2:
> > 
> > * New patch, issue reported by Peter Collingbourne.
> > 
> >  arch/arm64/kernel/mte.c | 26 --
> >  1 file changed, 24 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
> > index faf09da3400a..b1fa02dad4fd 100644
> > --- a/arch/arm64/kernel/mte.c
> > +++ b/arch/arm64/kernel/mte.c
> > @@ -412,10 +412,13 @@ static int __access_remote_tags(struct mm_struct *mm, 
> > unsigned long addr,
> > while (len) {
> > struct vm_area_struct *vma;
> > unsigned long tags, offset;
> > +   unsigned int fault_flags;
> > +   struct page *page;
> > +   vm_fault_t ret;
> > void *maddr;
> > -   struct page *page = get_user_page_vma_remote(mm, addr,
> > > -gup_flags, &vma);
> >  
> > +get_page:
> > > +   page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
> 
> > But if there is a valid page returned here in the first GUP attempt, will there
> still be a subsequent handle_mm_fault() on the same vma and addr ?

Only if it's missing tag storage. If it's missing tag storage, the page has
been mapped as arch_fault_on_access_pte(), and
handle_mm_fault()->..->arch_handle_folio_fault_on_access() will either
reserve tag storage, or migrate it.

> 
> > if (IS_ERR(page)) {
> > err = PTR_ERR(page);
> > break;
> > @@ -433,6 +436,25 @@ static int __access_remote_tags(struct mm_struct *mm, 
> > unsigned long addr,
> > put_page(page);
> > break;
> > }
> > +
> > +   if (tag_storage_enabled() && !page_tag_storage_reserved(page)) {
> 
> Should not '!page' be checked here as well ?

I was under the impression that get_user_page_vma_remote() returns an error
pointer if gup couldn't pin the page.

Thanks,
Alex

> 
> > +   fault_flags = FAULT_FLAG_DEFAULT | \
> > + FAULT_FLAG_USER | \
> > + FAULT_FLAG_REMOTE | \
> > + FAULT_FLAG_ALLOW_RETRY | \
> > + FAULT_FLAG_RETRY_NOWAIT;
> > +   if (write)
> > +   fault_flags |= FAULT_FLAG_WRITE;
> > +
> > +   put_page(page);
> > +   ret = handle_mm_fault(vma, addr, fault_flags, NULL);
> > +   if (ret & VM_FAULT_ERROR) {
> > +   err = -EFAULT;
> > +   break;
> > +   }
> > +   goto get_page;
> > +   }
> > +
> > WARN_ON_ONCE(!page_mte_tagged(page));
> >  
> > /* limit access to the end of the page */



Re: [PATCH RFC v3 13/35] mm: memory: Introduce fault-on-access mechanism for pages

2024-02-01 Thread Alexandru Elisei
Hi,

On Thu, Feb 01, 2024 at 11:22:13AM +0530, Anshuman Khandual wrote:
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > Introduce a mechanism that allows an architecture to trigger a page fault,
> > > and add the infrastructure to handle that fault accordingly. To make
> > > use of this, an arch is expected to mark the table entry as PAGE_NONE (which
> > will cause a fault next time it is accessed) and to implement an
> > arch-specific method (like a software bit) for recognizing that the fault
> > needs to be handled by the arch code.
> > 
> > > arm64 will make use of this approach to reserve tag storage for pages which are
> > mapped in an MTE enabled VMA, but the storage needed to store tags isn't
> > reserved (for example, because of an mprotect(PROT_MTE) call on a VMA with
> > existing pages).
> 
> > Just to summarize -
> 
> So platform will create NUMA balancing like page faults - via marking existing
> mappings with PAGE_NONE permission, when the subsequent fault happens identify
> such cases via a software bit in the page table entry and then route the fault
> to the platform code itself for special purpose page fault handling where page
> might come from some reserved areas instead.

Indeed. In the tag storage scenario, the page is a page that will be mapped
as tagged; if it's missing tag storage, the tag storage needs to be
reserved before it can be mapped as tagged (and tags can be accessed).

> 
> Some questions
> 
> - How often PAGE_NONE is to be marked for applicable MTE VMA based mappings 
> 
>   - Is it periodic like NUMA balancing or just one time for tag storage

It's deterministic, and only for tag storage. It's done in
set_ptes()/__set_pte_at()->..->mte_sync_tags(), if the page is going to be
mapped as tagged, but is missing tag storage. See patch #26 ("arm64: mte:
Use fault-on-access to reserve missing tag storage") [1] for the code.

[1] 
https://lore.kernel.org/linux-arm-kernel/20240125164256.4147-27-alexandru.eli...@arm.com/

> 
> - How this is going to interact with NUMA balancing given both use PAGE_NONE
> 
>   - How to differentiate these mappings from standard pte_protnone()

The only place where the difference matters is in do_numa_page(), here
renamed to handle_pte_protnone(), and in the huge page equivalent.

Userspace can access tags only if set_ptes()/__set_pte_at() maps the pte
with the PT_NORMAL_TAGGED attribute, but those functions will always map
the page as arch_fault_on_access_pte() if it's missing tag storage. That
makes it impossible for the kernel to map it as tagged behind our back.

Unless you had other concerns.

Thanks,
Alex

> 
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> > 
> > Changes since rfc v2:
> > 
> > * New patch. Split from patch #19 ("mm: mprotect: Introduce 
> > PAGE_FAULT_ON_ACCESS
> > for mprotect(PROT_MTE)") (David Hildenbrand).
> > 
> >  include/linux/huge_mm.h |  4 ++--
> >  include/linux/pgtable.h | 47 +++--
> >  mm/Kconfig  |  3 +++
> >  mm/huge_memory.c| 36 +
> >  mm/memory.c | 51 ++---
> >  5 files changed, 109 insertions(+), 32 deletions(-)
> > 
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 5adb86af35fc..4678a0a5e6a8 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -346,7 +346,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct 
> > *vma, unsigned long addr,
> >  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long 
> > addr,
> > pud_t *pud, int flags, struct dev_pagemap **pgmap);
> >  
> > -vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
> > +vm_fault_t handle_huge_pmd_protnone(struct vm_fault *vmf);
> >  
> >  extern struct page *huge_zero_page;
> >  extern unsigned long huge_zero_pfn;
> > @@ -476,7 +476,7 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t 
> > *pud,
> > return NULL;
> >  }
> >  
> > -static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
> > +static inline vm_fault_t handle_huge_pmd_protnone(struct vm_fault *vmf)
> >  {
> > return 0;
> >  }
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index 2d0f04042f62..81a21be855a2 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -1455,7 +1455,7 @@ static inline int pud_trans_unstable(pud_t *pud)
> > return 0;
> >  }
> >  
> > -#ifndef CONFIG_NUMA_BALANCING
> > +#if !defined(CONFIG_NUMA_BALANCIN

Re: [PATCH RFC v3 12/35] mm: Call arch_swap_prepare_to_restore() before arch_swap_restore()

2024-02-01 Thread Alexandru Elisei
Hi,

On Thu, Feb 01, 2024 at 09:00:23AM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > arm64 uses arch_swap_restore() to restore saved tags before the page is
> > swapped in and it's called in atomic context (with the ptl lock held).
> > 
> > Introduce arch_swap_prepare_to_restore() that will allow an architecture to
> > perform extra work during swap in and outside of a critical section.
> > This will be used by arm64 to allocate a buffer in memory where to
> > temporarily save tags if tag storage is not available for the page being
> > swapped in.
> 
> Just wondering if tag storage will always be unavailable for tagged pages
> being swapped in ? OR there are cases where allocation might not even be

In some (probably most) situations, tag storage will be available for the
page that will be swapped in. That's because either the page will have been
taken from the swap cache (which means it hasn't been freed, so it still
has tag storage reserved) or it has been allocated with vma_alloc_folio()
(when it's swapped back in in a VMA with VM_MTE set).

I've explained a scenario where tags will be restored for a page without
tag storage in patch #28 ("mte: swap: Handle tag restoring when missing tag
storage") [1]. Basically, it's because tagged pages can be mapped as tagged
in one VMA and untagged in another VMA; and swap tags are restored the
first time a page is swapped back in, even if it's swapped in a VMA with
MTE disabled.

[1] 
https://lore.kernel.org/linux-arm-kernel/20240125164256.4147-29-alexandru.eli...@arm.com/

> required ? This prepare phase needs to be outside the critical section -
> only because there might be memory allocations ?

Yes, exactly. See patch above :)

Thanks,
Alex



Re: [PATCH RFC v3 08/35] mm: cma: Introduce cma_alloc_range()

2024-01-31 Thread Alexandru Elisei
Hi,

On Wed, Jan 31, 2024 at 11:54:17AM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/30/24 17:05, Alexandru Elisei wrote:
> > Hi,
> > 
> > On Tue, Jan 30, 2024 at 10:50:00AM +0530, Anshuman Khandual wrote:
> >>
> >> On 1/25/24 22:12, Alexandru Elisei wrote:
> >>> Today, cma_alloc() is used to allocate a contiguous memory region. The
> >>> function allows the caller to specify the number of pages to allocate, but
> >>> not the starting address. cma_alloc() will walk over the entire CMA region
> >>> trying to allocate the first available range of the specified size.
> >>>
> >>> Introduce cma_alloc_range(), which makes CMA more versatile by allowing 
> >>> the
> >>> caller to specify a particular range in the CMA region, defined by the
> >>> start pfn and the size.
> >>>
> >>> arm64 will make use of this function when tag storage management will be
> >>> implemented: cma_alloc_range() will be used to reserve the tag storage
> >>> associated with a tagged page.
> >> Basically, you would like to pass on a preferred start address and the
> >> allocation could just fail if a contig range is not available from such
> >> a starting address ?
> >>
> >> Then why not just change cma_alloc() to take a new argument 'start_pfn'.
> >> Why create a new but almost similar allocator ?
> > I tried doing that, and I gave up because:
> > 
> > - It made cma_alloc() even more complex and hard to follow.
> > 
> > - What value should 'start_pfn' be to tell cma_alloc() that it should be
> >   ignored? Or, to put it another way, what pfn number is invalid on **all**
> >   platforms that Linux supports?
> > 
> > I can give it another go if we can come up with an invalid value for
> > 'start_pfn'.
> 
> Something negative might work. How about -1/-1UL ? A quick search gives
> some instances such as ...
> 
> git grep "pfn == -1"
> 
> mm/mm_init.c:   if (*start_pfn == -1UL)
> mm/vmscan.c:if (pfn == -1)
> mm/vmscan.c:if (pfn == -1)
> mm/vmscan.c:if (pfn == -1)
> tools/testing/selftests/mm/hugepage-vmemmap.c:  if (pfn == -1UL) {
> 
> Could not -1UL be abstracted as common macro MM_INVALID_PFN to be used in
> such scenarios including here ?

Ah yes, you are right, get_pte_pfn() already uses -1 as an invalid pfn, so
I can just use that.

Will definitely give it a go on the next iteration, thanks for the
suggestion!

> 
> > 
> >> But then I am wondering why this could not be done in the arm64 platform
> >> code itself operating on a CMA area reserved just for tag storage. Unless
> >> this new allocator has other usage beyond MTE, this could be implemented
> >> in the platform itself.
> > I had the same idea in the previous iteration, David Hildenbrand suggested
> > this approach [1].
> > 
> > [1] 
> > https://lore.kernel.org/linux-fsdevel/2aafd53f-af1f-45f3-a08c-d11962254...@redhat.com/
> 
> There are two different cma_alloc() proposals here - including the next
> patch i.e mm: cma: Fast track allocating memory when the pages are free
> 
> 1) Augment cma_alloc() or add cma_alloc_range() with start_pfn parameter
> 2) Speed up cma_alloc() for small allocation requests when pages are free
> 
> The second one if separated out from this series could be considered on
> its own as it will help all existing cma_alloc() callers. The first one
> > definitely needs a use case as provided in this series.

I understand, thanks for the input!

Alex



Re: [PATCH RFC v3 09/35] mm: cma: Introduce cma_remove_mem()

2024-01-31 Thread Alexandru Elisei
Hi,

On Wed, Jan 31, 2024 at 06:49:34PM +0530, Anshuman Khandual wrote:
> On 1/30/24 17:03, Alexandru Elisei wrote:
> > Hi,
> > 
> > I really appreciate the feedback you have given me so far. I believe the
> > commit message isn't clear enough and there has been a confusion.
> > 
> > A CMA user adds a CMA area to the cma_areas array with
> > cma_declare_contiguous_nid() or cma_init_reserved_mem().
> > init_cma_reserved_pageblock() then iterates over the array and activates
> > all cma areas.
> 
> Agreed.
> 
> > 
> > The function cma_remove_mem() is intended to be used to remove a cma area
> > from the cma_areas array **before** the area has been activated.
> 
> Understood.
> 
> > 
> > Usecase: a driver (in this case, the arm64 dynamic tag storage code)
> > manages several cma areas. The driver successfully adds the first area to
> > > the cma_areas array. When the driver tries to add the second area, the
> > function fails. Without cma_remove_mem(), the driver has no way to prevent
> > the first area from being freed to the page allocator. cma_remove_mem() is
> > about providing a means to do cleanup in case of error.
> > 
> > Does that make more sense now?
> 
> How to ensure that cma_remove_mem() should get called by the driver before
> core_initcall()---> cma_init_reserved_areas()---> cma_activate_area() chain
> happens. Else cma_remove_mem() will miss out to clear cma->count and given
> area will proceed to get activated like always.

The same way drivers today call cma_declare_contiguous_nid() and
cma_init_reserved_mem() before cma_init_reserved_areas(). For an example,
have a look at kernel/dma/contiguous.c:: rmem_cma_setup().

As for how the series uses cma_remove_mem(), have a look at patch #20
("arm64: mte: Add tag storage memory to CMA") [1], specifically this bit:

for (i = 0; i < num_tag_regions; i++) {
region = &tag_regions[i];

// code removed for clarity

ret = cma_init_reserved_mem(PFN_PHYS(region->tag_range.start),
PFN_PHYS(range_len(&region->tag_range)),
order, NULL, &region->cma);
if (ret) {
for (j = 0; j < i; j++)
cma_remove_mem(&region->cma);
goto out_disabled;
}
}

// code removed for clarity

out_disabled:
num_tag_regions = 0;
pr_info("MTE tag storage region management disabled");

I'll try to walk you through it. The driver manages 2 cma regions.

cma_init_reserved_mem() succeeds for the first region.

cma_init_reserved_mem() fails for the second region.

As a result, the first region will be activated (pages will be placed on
the MIGRATE_CMA list), but the second region will not be activated.

The driver can function only when **all** cma regions have been
successfully activated.

Driver removes first region from CMA, so now no regions will be activated,
and probing fails.

In a more general sense, cma_remove_mem() is **not** about removing a
region that failed initialization or activation, it's about removing a cma
area that was added to cma_areas successfully, but the driver doesn't want
to activate anymore for whatever reason (it can be because of a probing
error totally unrelated to CMA).

Does it make more sense now? I hope that this example also answers the rest
of your questions.

[1] 
https://lore.kernel.org/linux-arm-kernel/20240125164256.4147-21-alexandru.eli...@arm.com/

Thanks,
Alex

> 
> > 
> > > On Tue, Jan 30, 2024 at 11:20:56AM +0530, Anshuman Khandual wrote:
> >>
> >>
> >> On 1/25/24 22:12, Alexandru Elisei wrote:
> >>> Memory is added to CMA with cma_declare_contiguous_nid() and
> >>> cma_init_reserved_mem(). This memory is then put on the MIGRATE_CMA list 
> >>> in
> >>> cma_init_reserved_areas(), where the page allocator can make use of it.
> >>
> >> cma_declare_contiguous_nid() reserves memory in memblock and marks the
> > 
> > > You forgot about cma_init_reserved_mem() which does the same thing,
> > but yes, you are right.
> 
> Agreed, missed that. There are some direct cma_init_reserved_mem() calls as 
> well.
> 
> > 
> >> for subsequent CMA usage, where as cma_init_reserved_areas() activates
> >> these memory areas through init_cma_reserved_pageblock(). Standard page
> >> allocator only receives these memory via free_reserved_page() - only if
> > 
> > I don't think that's correct. init_cma_reserved_pageblock() clears the
> > PG_reserved page flag, sets the migratetype to MIGRATE_CMA and then frees
> > the page. After

Re: [PATCH RFC v3 06/35] mm: cma: Make CMA_ALLOC_SUCCESS/FAIL count the number of pages

2024-01-31 Thread Alexandru Elisei
Hi,

On Wed, Jan 31, 2024 at 10:10:05AM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/30/24 17:28, Alexandru Elisei wrote:
> > Hi,
> > 
> > On Tue, Jan 30, 2024 at 10:22:11AM +0530, Anshuman Khandual wrote:
> >>
> >> On 1/29/24 17:21, Alexandru Elisei wrote:
> >>> Hi,
> >>>
> >>> On Mon, Jan 29, 2024 at 02:54:20PM +0530, Anshuman Khandual wrote:
> >>>>
> >>>> On 1/25/24 22:12, Alexandru Elisei wrote:
> >>>>> The CMA_ALLOC_SUCCESS, respectively CMA_ALLOC_FAIL, are increased by one
> >>>>> after each cma_alloc() function call. This is done even though 
> >>>>> cma_alloc()
> >>>>> can allocate an arbitrary number of CMA pages. When looking at
> >>>>> /proc/vmstat, the number of successful (or failed) cma_alloc() calls
> >>>>> doesn't tell much with regards to how many CMA pages were allocated via
> >>>>> cma_alloc() versus via the page allocator (regular allocation request or
> >>>>> PCP lists refill).
> >>>>>
> >>>>> This can also be rather confusing to a user who isn't familiar with the
> >>>>> code, since the unit of measurement for nr_free_cma is the number of 
> >>>>> pages,
> >>>>> but cma_alloc_success and cma_alloc_fail count the number of cma_alloc()
> >>>>> function calls.
> >>>>>
> >>>>> Let's make this consistent, and arguably more useful, by having
> >>>>> CMA_ALLOC_SUCCESS count the number of successfully allocated CMA pages, 
> >>>>> and
> >>>>> CMA_ALLOC_FAIL count the number of pages the cma_alloc() failed to
> >>>>> allocate.
> >>>>>
> >>>>> For users that wish to track the number of cma_alloc() calls, there are
> >>>>> tracepoints for that already implemented.
> >>>>>
> >>>>> Signed-off-by: Alexandru Elisei 
> >>>>> ---
> >>>>>  mm/cma.c | 4 ++--
> >>>>>  1 file changed, 2 insertions(+), 2 deletions(-)
> >>>>>
> >>>>> diff --git a/mm/cma.c b/mm/cma.c
> >>>>> index f49c95f8ee37..dbf7fe8cb1bd 100644
> >>>>> --- a/mm/cma.c
> >>>>> +++ b/mm/cma.c
> >>>>> @@ -517,10 +517,10 @@ struct page *cma_alloc(struct cma *cma, unsigned 
> >>>>> long count,
> >>>>> pr_debug("%s(): returned %p\n", __func__, page);
> >>>>>  out:
> >>>>> if (page) {
> >>>>> -   count_vm_event(CMA_ALLOC_SUCCESS);
> >>>>> +   count_vm_events(CMA_ALLOC_SUCCESS, count);
> >>>>> cma_sysfs_account_success_pages(cma, count);
> >>>>> } else {
> >>>>> -   count_vm_event(CMA_ALLOC_FAIL);
> >>>>> +   count_vm_events(CMA_ALLOC_FAIL, count);
> >>>>> if (cma)
> >>>>> cma_sysfs_account_fail_pages(cma, count);
> >>>>> }
> >>>> Without getting into the merits of this patch - which is actually trying 
> >>>> to do
> >>>> semantics change to /proc/vmstat, wondering how is this even related to 
> >>>> this
> >>>> particular series ? If required this could be debated on its own
> >>>> separately.
> >>> Having the number of CMA pages allocated and the number of CMA pages freed
> >>> allows someone to infer how many tagged pages are in use at a given time:
> >> That should not be done in CMA which is a generic multi purpose allocator.
> 
> > Ah, ok. Let me rephrase that: Having the number of CMA pages allocated, the
> > number of failed CMA page allocations and the number of freed CMA pages
> > allows someone to infer how many CMA pages are in use at a given time.
> > That's valuable information for software designers and system
> > administrators, as it allows them to tune the number of CMA pages available
> > in a system.
> > 
> > Or put another way: what would you consider to be more useful?  Knowing the
> > number of cma_alloc()/cma_release() calls, or knowing the number of pages
> > that cma_alloc()/cma_release() allocated or freed?
> 
> There is still value in knowing how many times cma_alloc() succeeded or failed
> regardless of the cumulative number of pages involved over the time. Actually the
> count helps to understand how cma_alloc() performed overall as an allocator.
> 
> But on the cma_release() path there is no chances of failure apart from - just
> when the caller itself provides a wrong input. So there are no corresponding
> CMA_RELEASE_SUCCESS/CMA_RELEASE_FAIL vmstat counters in there - for a reason !
> 
> Coming back to CMA based pages being allocated and freed, there is already an
> interface via sysfs (CONFIG_CMA_SYSFS) which gets updated in cma_alloc() path
> via cma_sysfs_account_success_pages() and cma_sysfs_account_fail_pages().
> 
> #ls /sys/kernel/mm/cma/
> alloc_pages_fail alloc_pages_success
> 
> Why these counters could not meet your requirements ? Also 'struct cma' can
> be updated to add an element 'nr_pages_freed' to be tracked in cma_release(),
> providing free pages count as well.
> 
> There are additional debug fs based elements (CONFIG_CMA_DEBUGFS) available.
> 
> #ls /sys/kernel/debug/cma/
> alloc  base_pfn  bitmap  count  free  maxchunk  order_per_bit  used

Ok, I'll have a look at those, thank you for the suggestion.

Thanks,
Alex



Re: [PATCH RFC v3 11/35] mm: Allow an arch to hook into folio allocation when VMA is known

2024-01-31 Thread Alexandru Elisei
Hi,

On Wed, Jan 31, 2024 at 12:23:51PM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/30/24 17:04, Alexandru Elisei wrote:
> > Hi,
> > 
> > On Tue, Jan 30, 2024 at 03:25:20PM +0530, Anshuman Khandual wrote:
> >>
> >> On 1/25/24 22:12, Alexandru Elisei wrote:
> >>> arm64 uses VM_HIGH_ARCH_0 and VM_HIGH_ARCH_1 for enabling MTE for a VMA.
> >>> When VM_HIGH_ARCH_0, which arm64 renames to VM_MTE, is set for a VMA, and
> >>> the gfp flag __GFP_ZERO is present, the __GFP_ZEROTAGS gfp flag also gets
> >>> set in vma_alloc_zeroed_movable_folio().
> >>>
> >>> Expand this to be more generic by adding an arch hook that modifies the gfp
> >>> flags for an allocation when the VMA is known.
> >>>
> >>> Note that __GFP_ZEROTAGS is ignored by the page allocator unless 
> >>> __GFP_ZERO
> >>> is also set; from that point of view, the current behaviour is unchanged,
> >>> even though the arm64 flag is set in more places.  When arm64 will have
> >>> support to reuse the tag storage for data allocation, the uses of the
> >>> __GFP_ZEROTAGS flag will be expanded to instruct the page allocator to try
> >>> to reserve the corresponding tag storage for the pages being allocated.
> >> Right but how will pushing __GFP_ZEROTAGS addition into gfp_t flags further
> >> down via a new arch call back i.e arch_calc_vma_gfp() while still 
> >> maintaining
> >> (vma->vm_flags & VM_MTE) conditionality improve the current scenario. 
> >> Because
> > I'm afraid I don't follow you.
> 
> I was just asking whether the overall scope of __GFP_ZEROTAGS flag is being
> increased to cover more core MM paths through this patch. I think you have
> already answered that below.
> 
> > 
> >> the page allocator could have still analyzed alloc flags for __GFP_ZEROTAGS
> >> for any additional stuff.
> >>
> >> OR this just adds some new core MM paths to get __GFP_ZEROTAGS which was 
> >> not
> >> the case earlier via this call back.
> > Before this patch: vma_alloc_zeroed_movable_folio() sets __GFP_ZEROTAGS.
> > After this patch: vma_alloc_folio() sets __GFP_ZEROTAGS.
> 
> Understood.
> 
> > 
> > This patch is about adding __GFP_ZEROTAGS for more callers.
> 
> Right, I guess that is the real motivation for this patch. But just wondering
> does this cover all possible anon fault paths for converting given vma_flag's
> VM_MTE flag into page alloc flag __GFP_ZEROTAGS ? Aren't there any other file
> besides (mm/shmem.c) which needs to be changed to include arch_calc_vma_gfp() 
> ?

My thoughts exactly. I went through most of the fault handling code, and
from the code I read, all the allocations were executed with
vma_alloc_folio() or by shmem.

That's not to say there's no scope for improvement, there definitely is, but
since having __GFP_ZEROTAGS isn't necessary for correctness (but it's very
useful for performance, since it can avoid a page fault and a page
migration) and this series is an RFC I settled on changing only the above,
since KVM support for dynamic tag storage also benefits from this change.

The series is very big already, I wanted to settle on an approach that is
acceptable for upstreaming before thinking too much about performance.

Thanks,
Alex



Re: [PATCH RFC v3 06/35] mm: cma: Make CMA_ALLOC_SUCCESS/FAIL count the number of pages

2024-01-30 Thread Alexandru Elisei
Hi,

On Tue, Jan 30, 2024 at 10:22:11AM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/29/24 17:21, Alexandru Elisei wrote:
> > Hi,
> > 
> > On Mon, Jan 29, 2024 at 02:54:20PM +0530, Anshuman Khandual wrote:
> >>
> >>
> >> On 1/25/24 22:12, Alexandru Elisei wrote:
> >>> The CMA_ALLOC_SUCCESS, respectively CMA_ALLOC_FAIL, are increased by one
> >>> after each cma_alloc() function call. This is done even though cma_alloc()
> >>> can allocate an arbitrary number of CMA pages. When looking at
> >>> /proc/vmstat, the number of successful (or failed) cma_alloc() calls
> >>> doesn't tell much with regards to how many CMA pages were allocated via
> >>> cma_alloc() versus via the page allocator (regular allocation request or
> >>> PCP lists refill).
> >>>
> >>> This can also be rather confusing to a user who isn't familiar with the
> >>> code, since the unit of measurement for nr_free_cma is the number of 
> >>> pages,
> >>> but cma_alloc_success and cma_alloc_fail count the number of cma_alloc()
> >>> function calls.
> >>>
> >>> Let's make this consistent, and arguably more useful, by having
> >>> CMA_ALLOC_SUCCESS count the number of successfully allocated CMA pages, 
> >>> and
> >>> CMA_ALLOC_FAIL count the number of pages the cma_alloc() failed to
> >>> allocate.
> >>>
> >>> For users that wish to track the number of cma_alloc() calls, there are
> >>> tracepoints for that already implemented.
> >>>
> >>> Signed-off-by: Alexandru Elisei 
> >>> ---
> >>>  mm/cma.c | 4 ++--
> >>>  1 file changed, 2 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/mm/cma.c b/mm/cma.c
> >>> index f49c95f8ee37..dbf7fe8cb1bd 100644
> >>> --- a/mm/cma.c
> >>> +++ b/mm/cma.c
> >>> @@ -517,10 +517,10 @@ struct page *cma_alloc(struct cma *cma, unsigned 
> >>> long count,
> >>>   pr_debug("%s(): returned %p\n", __func__, page);
> >>>  out:
> >>>   if (page) {
> >>> - count_vm_event(CMA_ALLOC_SUCCESS);
> >>> + count_vm_events(CMA_ALLOC_SUCCESS, count);
> >>>   cma_sysfs_account_success_pages(cma, count);
> >>>   } else {
> >>> - count_vm_event(CMA_ALLOC_FAIL);
> >>> + count_vm_events(CMA_ALLOC_FAIL, count);
> >>>   if (cma)
> >>>   cma_sysfs_account_fail_pages(cma, count);
> >>>   }
> >>
> >> Without getting into the merits of this patch - which is actually trying 
> >> to do
> >> semantics change to /proc/vmstat, wondering how is this even related to 
> >> this
> >> particular series ? If required this could be debated on its own
> >> separately.
> > 
> > Having the number of CMA pages allocated and the number of CMA pages freed
> > allows someone to infer how many tagged pages are in use at a given time:
> 
> That should not be done in CMA which is a generic multi purpose allocator.

Ah, ok. Let me rephrase that: Having the number of CMA pages allocated, the
number of failed CMA page allocations and the number of freed CMA pages
allows someone to infer how many CMA pages are in use at a given time.
That's valuable information for software designers and system
administrators, as it allows them to tune the number of CMA pages available
in a system.

Or put another way: what would you consider to be more useful?  Knowing the
number of cma_alloc()/cma_release() calls, or knowing the number of pages
that cma_alloc()/cma_release() allocated or freed?

> 
> > (allocated CMA pages - CMA pages allocated by drivers* - CMA pages
> > released) * 32. That is valuable information for software and hardware
> > designers.
> > 
> > Besides that, for every iteration of the series, this has proven invaluable
> > for discovering bugs with freeing and/or reserving tag storage pages.
> 
> I am afraid that might not be enough justification for getting something
> merged mainline.
> 
> > 
> > *that would require userspace reading cma_alloc_success and
> > cma_release_success before any tagged allocations are performed.
> 
> While assuming that no other non-memory-tagged CMA based allocation and free
> call happens in the meantime ? That would be on real thin ice.
> 
> I suppose arm64 tagged memory specific allocation or free related counters
> need to be created on the caller side, including arch_free_pages_prepare().

I'll think about this. At the very least, I can add tracepoints.

Thanks,
Alex



Re: [PATCH RFC v3 04/35] mm: page_alloc: Partially revert "mm: page_alloc: remove stale CMA guard code"

2024-01-30 Thread Alexandru Elisei
Hi,

On Tue, Jan 30, 2024 at 10:04:02AM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/29/24 17:16, Alexandru Elisei wrote:
> > Hi,
> > 
> > On Mon, Jan 29, 2024 at 02:31:23PM +0530, Anshuman Khandual wrote:
> >>
> >>
> >> On 1/25/24 22:12, Alexandru Elisei wrote:
> >>> The patch f945116e4e19 ("mm: page_alloc: remove stale CMA guard code")
> >>> removed the CMA filter when allocating from the MIGRATE_MOVABLE pcp list
> >>> because CMA is always allowed when __GFP_MOVABLE is set.
> >>>
> >>> With the introduction of the arch_alloc_cma() function, the above is not
> >>> true anymore, so bring back the filter.
> >>
> >> This makes sense as arch_alloc_cma() now might prevent ALLOC_CMA being
> >> assigned to alloc_flags in gfp_to_alloc_flags_cma().
> > 
> > Can I add your Reviewed-by tag then?
> 
> I think all these changes need to be reviewed in their entirety
> even though some patches do look good on their own. For example
> this patch depends on whether [PATCH 03/35] is acceptable or not.
> 
> I would suggest separating out CMA patches which could be debated
> and merged regardless of this series.

Ah, I see, makes sense. Since basically all the core mm changes are there
to enable dynamic tag storage for arm64, I'll hold on until the series
stabilises before separating the core mm from the arm64 patches.

Thanks,
Alex



Re: [PATCH RFC v3 01/35] mm: page_alloc: Add gfp_flags parameter to arch_alloc_page()

2024-01-30 Thread Alexandru Elisei
Hi,

On Tue, Jan 30, 2024 at 09:56:10AM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/29/24 17:11, Alexandru Elisei wrote:
> > Hi,
> > 
> > On Mon, Jan 29, 2024 at 11:18:59AM +0530, Anshuman Khandual wrote:
> >> On 1/25/24 22:12, Alexandru Elisei wrote:
> >>> Extend the usefulness of arch_alloc_page() by adding the gfp_flags
> >>> parameter.
> >> Although the change here is harmless in itself, it will definitely benefit
> >> from some additional context explaining the rationale, taking into account
> >> why-how arch_alloc_page() got added particularly for s390 platform and how
> >> it's going to be used in the present proposal.
> > arm64 will use it to reserve tag storage if the caller requested a tagged
> > page. Right now that means that __GFP_ZEROTAGS is set in the gfp mask, but
> > I'll rename it to __GFP_TAGGED in patch #18 ("arm64: mte: Rename
> > __GFP_ZEROTAGS to __GFP_TAGGED") [1].
> > 
> > [1] 
> > https://lore.kernel.org/lkml/20240125164256.4147-19-alexandru.eli...@arm.com/
> 
> Makes sense, but please do update the commit message explaining how
> new gfp mask argument will be used to detect tagged page allocation
> requests, further requiring tag storage allocation.

Will do, thanks!

Alex



Re: [PATCH RFC v3 23/35] arm64: mte: Try to reserve tag storage in arch_alloc_page()

2024-01-30 Thread Alexandru Elisei
Hi Peter,

On Mon, Jan 29, 2024 at 04:04:18PM -0800, Peter Collingbourne wrote:
> On Thu, Jan 25, 2024 at 8:45 AM Alexandru Elisei
>  wrote:
> >
> > Reserve tag storage for a page that is being allocated as tagged. This
> > is a best effort approach, and failing to reserve tag storage is
> > allowed.
> >
> > When all the associated tagged pages have been freed, return the tag
> > storage pages back to the page allocator, where they can be used again for
> > data allocations.
> >
> > Signed-off-by: Alexandru Elisei 
> > ---
> >
> > Changes since rfc v2:
> >
> > * Based on rfc v2 patch #16 ("arm64: mte: Manage tag storage on page
> > allocation").
> > * Fixed calculation of the number of associated tag storage blocks (Hyesoo
> > Yu).
> > * Tag storage is reserved in arch_alloc_page() instead of
> > arch_prep_new_page().
> >
> >  arch/arm64/include/asm/mte.h |  16 +-
> >  arch/arm64/include/asm/mte_tag_storage.h |  31 +++
> >  arch/arm64/include/asm/page.h|   5 +
> >  arch/arm64/include/asm/pgtable.h |  19 ++
> >  arch/arm64/kernel/mte_tag_storage.c  | 234 +++
> >  arch/arm64/mm/fault.c|   7 +
> >  fs/proc/page.c   |   1 +
> >  include/linux/kernel-page-flags.h|   1 +
> >  include/linux/page-flags.h   |   1 +
> >  include/trace/events/mmflags.h   |   3 +-
> >  mm/huge_memory.c |   1 +
> >  11 files changed, 316 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
> > index 8034695b3dd7..6457b7899207 100644
> > --- a/arch/arm64/include/asm/mte.h
> > +++ b/arch/arm64/include/asm/mte.h
> > @@ -40,12 +40,24 @@ void mte_free_tag_buf(void *buf);
> >  #ifdef CONFIG_ARM64_MTE
> >
> >  /* track which pages have valid allocation tags */
> > -#define PG_mte_tagged  PG_arch_2
> > +#define PG_mte_tagged  PG_arch_2
> >  /* simple lock to avoid multiple threads tagging the same page */
> > -#define PG_mte_lock	PG_arch_3
> > +#define PG_mte_lock		PG_arch_3
> > +/* Track if a tagged page has tag storage reserved */
> > +#define PG_tag_storage_reserved	PG_arch_4
> > +
> > +#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> > +DECLARE_STATIC_KEY_FALSE(tag_storage_enabled_key);
> > +extern bool page_tag_storage_reserved(struct page *page);
> > +#endif
> >
> >  static inline void set_page_mte_tagged(struct page *page)
> >  {
> > +#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> > +   /* Open code mte_tag_storage_enabled() */
> > +   WARN_ON_ONCE(static_branch_likely(&tag_storage_enabled_key) &&
> > +!page_tag_storage_reserved(page));
> > +#endif
> > /*
> >  * Ensure that the tags written prior to this function are visible
> >  * before the page flags update.
> > diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
> > b/arch/arm64/include/asm/mte_tag_storage.h
> > index 7b3f6bff8e6f..09f1318d924e 100644
> > --- a/arch/arm64/include/asm/mte_tag_storage.h
> > +++ b/arch/arm64/include/asm/mte_tag_storage.h
> > @@ -5,6 +5,12 @@
> >  #ifndef __ASM_MTE_TAG_STORAGE_H
> >  #define __ASM_MTE_TAG_STORAGE_H
> >
> > +#ifndef __ASSEMBLY__
> > +
> > +#include 
> > +
> > +#include 
> > +
> >  #ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> >
> >  DECLARE_STATIC_KEY_FALSE(tag_storage_enabled_key);
> > @@ -15,6 +21,15 @@ static inline bool tag_storage_enabled(void)
> >  }
> >
> >  void mte_init_tag_storage(void);
> > +
> > +static inline bool alloc_requires_tag_storage(gfp_t gfp)
> > +{
> > +   return gfp & __GFP_TAGGED;
> > +}
> > +int reserve_tag_storage(struct page *page, int order, gfp_t gfp);
> > +void free_tag_storage(struct page *page, int order);
> > +
> > +bool page_tag_storage_reserved(struct page *page);
> >  #else
> >  static inline bool tag_storage_enabled(void)
> >  {
> > @@ -23,6 +38,22 @@ static inline bool tag_storage_enabled(void)
> >  static inline void mte_init_tag_storage(void)
> >  {
> >  }
> > +static inline bool alloc_requires_tag_storage(struct page *page)
> 
> This function should take a gfp_t to match the
> CONFIG_ARM64_MTE_TAG_STORAGE case.

Ah, yes, it should, nice catch, the compiler didn't throw an error. Will
fix, thanks!

Alex



Re: [PATCH RFC v3 08/35] mm: cma: Introduce cma_alloc_range()

2024-01-30 Thread Alexandru Elisei
Hi,

On Tue, Jan 30, 2024 at 10:50:00AM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > Today, cma_alloc() is used to allocate a contiguous memory region. The
> > function allows the caller to specify the number of pages to allocate, but
> > not the starting address. cma_alloc() will walk over the entire CMA region
> > trying to allocate the first available range of the specified size.
> > 
> > Introduce cma_alloc_range(), which makes CMA more versatile by allowing the
> > caller to specify a particular range in the CMA region, defined by the
> > start pfn and the size.
> > 
> > arm64 will make use of this function when tag storage management will be
> > implemented: cma_alloc_range() will be used to reserve the tag storage
> > associated with a tagged page.
> 
> Basically, you would like to pass on a preferred start address and the
> allocation could just fail if a contig range is not available from such
> a starting address ?
> 
> Then why not just change cma_alloc() to take a new argument 'start_pfn'.
> Why create a new but almost similar allocator ?

I tried doing that, and I gave up because:

- It made cma_alloc() even more complex and hard to follow.

- What value should 'start_pfn' be to tell cma_alloc() that it should be
  ignored? Or, to put it another way, what pfn number is invalid on **all**
  platforms that Linux supports?

I can give it another go if we can come up with an invalid value for
'start_pfn'.

> 
> But then I am wondering why this could not be done in the arm64 platform
> code itself operating on a CMA area reserved just for tag storage. Unless
> this new allocator has other usage beyond MTE, this could be implemented
> in the platform itself.

I had the same idea in the previous iteration, David Hildenbrand suggested
this approach [1].

[1] 
https://lore.kernel.org/linux-fsdevel/2aafd53f-af1f-45f3-a08c-d11962254...@redhat.com/

Thanks,
Alex

> 
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> > 
> > Changes since rfc v2:
> > 
> > * New patch.
> > 
> >  include/linux/cma.h|  2 +
> >  include/trace/events/cma.h | 59 ++
> >  mm/cma.c   | 86 ++
> >  3 files changed, 147 insertions(+)
> > 
> > diff --git a/include/linux/cma.h b/include/linux/cma.h
> > index 63873b93deaa..e32559da6942 100644
> > --- a/include/linux/cma.h
> > +++ b/include/linux/cma.h
> > @@ -50,6 +50,8 @@ extern int cma_init_reserved_mem(phys_addr_t base, 
> > phys_addr_t size,
> > struct cma **res_cma);
> >  extern struct page *cma_alloc(struct cma *cma, unsigned long count, 
> > unsigned int align,
> >   bool no_warn);
> > +extern int cma_alloc_range(struct cma *cma, unsigned long start, unsigned 
> > long count,
> > +  unsigned tries, gfp_t gfp);
> >  extern bool cma_pages_valid(struct cma *cma, const struct page *pages, 
> > unsigned long count);
> >  extern bool cma_release(struct cma *cma, const struct page *pages, 
> > unsigned long count);
> >  
> > diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
> > index 25103e67737c..a89af313a572 100644
> > --- a/include/trace/events/cma.h
> > +++ b/include/trace/events/cma.h
> > @@ -36,6 +36,65 @@ TRACE_EVENT(cma_release,
> >   __entry->count)
> >  );
> >  
> > +TRACE_EVENT(cma_alloc_range_start,
> > +
> > +   TP_PROTO(const char *name, unsigned long start, unsigned long count,
> > +unsigned tries),
> > +
> > +   TP_ARGS(name, start, count, tries),
> > +
> > +   TP_STRUCT__entry(
> > +   __string(name, name)
> > +   __field(unsigned long, start)
> > +   __field(unsigned long, count)
> > +   __field(unsigned, tries)
> > +   ),
> > +
> > +   TP_fast_assign(
> > +   __assign_str(name, name);
> > +   __entry->start = start;
> > +   __entry->count = count;
> > +   __entry->tries = tries;
> > +   ),
> > +
> > +   TP_printk("name=%s start=%lx count=%lu tries=%u",
> > + __get_str(name),
> > + __entry->start,
> > + __entry->count,
> > + __entry->tries)
> > +);
> > +
> > +TRACE_EVENT(cma_alloc_range_finish,
> > +
> > +   TP_PROTO(const char *name, unsigned long start, unsigned long count,
> > +unsigned attempts, int err),
> > +
> > +   TP

Re: [PATCH RFC v3 11/35] mm: Allow an arch to hook into folio allocation when VMA is known

2024-01-30 Thread Alexandru Elisei
Hi,

On Tue, Jan 30, 2024 at 03:25:20PM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > arm64 uses VM_HIGH_ARCH_0 and VM_HIGH_ARCH_1 for enabling MTE for a VMA.
> > When VM_HIGH_ARCH_0, which arm64 renames to VM_MTE, is set for a VMA, and
> > the gfp flag __GFP_ZERO is present, the __GFP_ZEROTAGS gfp flag also gets
> > set in vma_alloc_zeroed_movable_folio().
> > 
> > Expand this to be more generic by adding an arch hook that modifies the gfp
> > flags for an allocation when the VMA is known.
> > 
> > Note that __GFP_ZEROTAGS is ignored by the page allocator unless __GFP_ZERO
> > is also set; from that point of view, the current behaviour is unchanged,
> > even though the arm64 flag is set in more places.  When arm64 will have
> > support to reuse the tag storage for data allocation, the uses of the
> > __GFP_ZEROTAGS flag will be expanded to instruct the page allocator to try
> > to reserve the corresponding tag storage for the pages being allocated.
> 
> Right but how will pushing __GFP_ZEROTAGS addition into gfp_t flags further
> down via a new arch call back i.e arch_calc_vma_gfp() while still maintaining
> (vma->vm_flags & VM_MTE) conditionality improve the current scenario. Because

I'm afraid I don't follow you.

> the page allocator could have still analyzed alloc flags for __GFP_ZEROTAGS
> for any additional stuff.
> 
> OR this just adds some new core MM paths to get __GFP_ZEROTAGS which was not
> the case earlier via this call back.

Before this patch: vma_alloc_zeroed_movable_folio() sets __GFP_ZEROTAGS.
After this patch: vma_alloc_folio() sets __GFP_ZEROTAGS.

This patch is about adding __GFP_ZEROTAGS for more callers.

Thanks,
Alex

> 
> > 
> > The flags returned by arch_calc_vma_gfp() are or'ed with the flags set by
> > the caller; this has been done to keep an architecture from modifying the
> > flags already set by the core memory management code; this is similar to
> > how do_mmap() -> calc_vm_flag_bits() -> arch_calc_vm_flag_bits() has been
> > implemented. This can be revisited in the future if there's a need to do
> > so.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  arch/arm64/include/asm/page.h|  5 ++---
> >  arch/arm64/include/asm/pgtable.h |  3 +++
> >  arch/arm64/mm/fault.c| 19 ++-
> >  include/linux/pgtable.h  |  7 +++
> >  mm/mempolicy.c   |  1 +
> >  mm/shmem.c   |  5 -
> >  6 files changed, 23 insertions(+), 17 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
> > index 2312e6ee595f..88bab032a493 100644
> > --- a/arch/arm64/include/asm/page.h
> > +++ b/arch/arm64/include/asm/page.h
> > @@ -29,9 +29,8 @@ void copy_user_highpage(struct page *to, struct page 
> > *from,
> >  void copy_highpage(struct page *to, struct page *from);
> >  #define __HAVE_ARCH_COPY_HIGHPAGE
> >  
> > -struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
> > -   unsigned long vaddr);
> > -#define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio
> > +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
> > +   vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
> >  
> >  void tag_clear_highpage(struct page *to);
> >  #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
> > diff --git a/arch/arm64/include/asm/pgtable.h 
> > b/arch/arm64/include/asm/pgtable.h
> > index 79ce70fbb751..08f0904dbfc2 100644
> > --- a/arch/arm64/include/asm/pgtable.h
> > +++ b/arch/arm64/include/asm/pgtable.h
> > @@ -1071,6 +1071,9 @@ static inline void arch_swap_restore(swp_entry_t 
> > entry, struct folio *folio)
> >  
> >  #endif /* CONFIG_ARM64_MTE */
> >  
> > +#define __HAVE_ARCH_CALC_VMA_GFP
> > +gfp_t arch_calc_vma_gfp(struct vm_area_struct *vma, gfp_t gfp);
> > +
> >  /*
> >   * On AArch64, the cache coherency is handled via the set_pte_at() 
> > function.
> >   */
> > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> > index 55f6455a8284..4d3f0a870ad8 100644
> > --- a/arch/arm64/mm/fault.c
> > +++ b/arch/arm64/mm/fault.c
> > @@ -937,22 +937,15 @@ void do_debug_exception(unsigned long 
> > addr_if_watchpoint, unsigned long esr,
> >  NOKPROBE_SYMBOL(do_debug_exception);
> >  
> >  /*
> > - * Used during anonymous page fault handling.
> > + * If this is called during anonymous page fault handling, and the page is
> > + * mapped with PROT

Re: [PATCH RFC v3 10/35] mm: cma: Fast track allocating memory when the pages are free

2024-01-30 Thread Alexandru Elisei
Hi,

On Tue, Jan 30, 2024 at 02:48:53PM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > If the pages to be allocated are free, take them directly off the buddy
> > allocator, instead of going through alloc_contig_range() and avoiding
> > costly calls to lru_cache_disable().
> > 
> > Only allocations of the same size as the CMA region order are considered,
> > to avoid taking the zone spinlock for too long.
> > 
> > Signed-off-by: Alexandru Elisei 
> 
> This patch seems to be improving standard cma_alloc() as well as
> the previously added new allocator i.e cma_alloc_range() - via a
> new helper cma_alloc_pages_fastpath().

Yes, that's correct.

> 
> Should not any standard cma_alloc() improvement be discussed as
> an independent patch separately irrespective of this series. OR
> it is some how related to this series which I might be missing ?

Yes, it's related to this series. I wrote this patch because it fixes a
performance regression with Chrome when dynamic tag storage management is
enabled [1]. I will bring back the commit message explaining that.

[1] 
https://lore.kernel.org/linux-fsdevel/20231119165721.9849-27-alexandru.eli...@arm.com/

Thanks,
Alex

> 
> > ---
> > 
> > Changes since rfc v2:
> > 
> > * New patch. Reworked from the rfc v2 patch #26 ("arm64: mte: Fast track
> > reserving tag storage when the block is free") (David Hildenbrand).
> > 
> >  include/linux/page-flags.h | 15 --
> >  mm/Kconfig |  5 +
> >  mm/cma.c   | 42 ++
> >  mm/memory-failure.c|  8 
> >  mm/page_alloc.c| 23 -
> >  5 files changed, 73 insertions(+), 20 deletions(-)
> > 
> > diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> > index 735cddc13d20..b7237bce7446 100644
> > --- a/include/linux/page-flags.h
> > +++ b/include/linux/page-flags.h
> > @@ -575,11 +575,22 @@ TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
> >  #define MAGIC_HWPOISON 0x48575053U /* HWPS */
> >  extern void SetPageHWPoisonTakenOff(struct page *page);
> >  extern void ClearPageHWPoisonTakenOff(struct page *page);
> > -extern bool take_page_off_buddy(struct page *page);
> > -extern bool put_page_back_buddy(struct page *page);
> > +extern bool PageHWPoisonTakenOff(struct page *page);
> >  #else
> >  PAGEFLAG_FALSE(HWPoison, hwpoison)
> > +TESTSCFLAG_FALSE(HWPoison, hwpoison)
> >  #define __PG_HWPOISON 0
> > +static inline void SetPageHWPoisonTakenOff(struct page *page) { }
> > +static inline void ClearPageHWPoisonTakenOff(struct page *page) { }
> > +static inline bool PageHWPoisonTakenOff(struct page *page)
> > +{
> > +  return false;
> > +}
> > +#endif
> > +
> > +#ifdef CONFIG_WANTS_TAKE_PAGE_OFF_BUDDY
> > +extern bool take_page_off_buddy(struct page *page, bool poison);
> > +extern bool put_page_back_buddy(struct page *page, bool unpoison);
> >  #endif
> >  
> >  #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index ffc3a2ba3a8c..341cf53898db 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -745,12 +745,16 @@ config DEFAULT_MMAP_MIN_ADDR
> >  config ARCH_SUPPORTS_MEMORY_FAILURE
> > bool
> >  
> > +config WANTS_TAKE_PAGE_OFF_BUDDY
> > +   bool
> > +
> >  config MEMORY_FAILURE
> > depends on MMU
> > depends on ARCH_SUPPORTS_MEMORY_FAILURE
> > bool "Enable recovery from hardware memory errors"
> > select MEMORY_ISOLATION
> > select RAS
> > +   select WANTS_TAKE_PAGE_OFF_BUDDY
> > help
> >   Enables code to recover from some memory failures on systems
> >   with MCA recovery. This allows a system to continue running
> > @@ -891,6 +895,7 @@ config CMA
> > depends on MMU
> > select MIGRATION
> > select MEMORY_ISOLATION
> > +   select WANTS_TAKE_PAGE_OFF_BUDDY
> > help
> >   This enables the Contiguous Memory Allocator which allows other
> >   subsystems to allocate big physically-contiguous blocks of memory.
> > diff --git a/mm/cma.c b/mm/cma.c
> > index 2881bab12b01..15663f95d77b 100644
> > --- a/mm/cma.c
> > +++ b/mm/cma.c
> > @@ -444,6 +444,34 @@ static void cma_debug_show_areas(struct cma *cma)
> >  static inline void cma_debug_show_areas(struct cma *cma) { }
> >  #endif
> >  
> > +/* Called with the cma mutex held. */
> &

Re: [PATCH RFC v3 09/35] mm: cma: Introduce cma_remove_mem()

2024-01-30 Thread Alexandru Elisei
Hi,

I really appreciate the feedback you have given me so far. I believe the
commit message isn't clear enough and there has been a confusion.

A CMA user adds a CMA area to the cma_areas array with
cma_declare_contiguous_nid() or cma_init_reserved_mem().
init_cma_reserved_pageblock() then iterates over the array and activates
all cma areas.

The function cma_remove_mem() is intended to be used to remove a cma area
from the cma_areas array **before** the area has been activated.

Usecase: a driver (in this case, the arm64 dynamic tag storage code)
manages several cma areas. The driver successfully adds the first area to
the cma_areas array. When the driver tries to add the second area, the
function fails. Without cma_remove_mem(), the driver has no way to prevent
the first area from being freed to the page allocator. cma_remove_mem() is
about providing a means to do cleanup in case of error.

Does that make more sense now?

On Tue, Jan 30, 2024 at 11:20:56AM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > Memory is added to CMA with cma_declare_contiguous_nid() and
> > cma_init_reserved_mem(). This memory is then put on the MIGRATE_CMA list in
> > cma_init_reserved_areas(), where the page allocator can make use of it.
> 
> cma_declare_contiguous_nid() reserves memory in memblock and marks the

You forgot about cma_init_reserved_mem() which does the same thing,
but yes, you are right.

> for subsequent CMA usage, where as cma_init_reserved_areas() activates
> these memory areas through init_cma_reserved_pageblock(). Standard page
> allocator only receives these memory via free_reserved_page() - only if

I don't think that's correct. init_cma_reserved_pageblock() clears the
PG_reserved page flag, sets the migratetype to MIGRATE_CMA and then frees
the page. After that, the page is available to the standard page allocator
to use for allocation. Otherwise, what would be the point of the
MIGRATE_CMA migratetype?

> the page block activation fails.

For the sake of having a complete picture, I'll add that that only happens
if cma->reserve_pages_on_error is false. If the CMA user sets the field to
'true' (with cma_reserve_pages_on_error()), then the pages in the CMA
region are kept PG_reserved if activation fails.

> 
> > 
> > If a device manages multiple CMA areas, and there's an error when one of
> > the areas is added to CMA, there is no mechanism for the device to prevent
> 
> What kind of error ? init_cma_reserved_pageblock() fails ? But that will
> not happen until cma_init_reserved_areas().

I think I haven't been clear enough. When I say that "an area is added
to CMA", I mean that the memory region is added to cma_areas array, via
cma_declare_contiguous_nid() or cma_init_reserved_mem(). There are several
ways in which either function can fail.

> 
> > the rest of the areas, which were added before the error occurred, from
> > being later added to the MIGRATE_CMA list.
> 
> Why is this mechanism required ? cma_init_reserved_areas() scans over all
> CMA areas and try and activate each of them sequentially. Why is not this
> sufficient ?

This patch is about removing a struct cma from the cma_areas array after it
has been added to the array, with cma_declare_contiguous_nid() or
cma_init_reserved_mem(), to prevent the area from being activated in
cma_init_reserved_areas(). Sorry for the confusion.

I'll add a check in cma_remove_mem() to fail if the cma area has been
activated, and a comment to the function to explain its usage.

> 
> > 
> > Add cma_remove_mem() which allows a previously reserved CMA area to be
> > removed and thus it cannot be used by the page allocator.
> 
> Successfully activated CMA areas do not get used by the buddy allocator.

I don't believe that is correct, see above.

> 
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> > 
> > Changes since rfc v2:
> > 
> > * New patch.
> > 
> >  include/linux/cma.h |  1 +
> >  mm/cma.c| 30 +-
> >  2 files changed, 30 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/cma.h b/include/linux/cma.h
> > index e32559da6942..787cbec1702e 100644
> > --- a/include/linux/cma.h
> > +++ b/include/linux/cma.h
> > @@ -48,6 +48,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, 
> > phys_addr_t size,
> > unsigned int order_per_bit,
> > const char *name,
> > struct cma **res_cma);
> > +extern void cma_remove_mem(struct cma **res_cma);
> >  extern struct page *cma_alloc(struct cma *cma, unsigned long count, 
> > unsigned int align,
> >   b

Re: [PATCH RFC v3 11/35] mm: Allow an arch to hook into folio allocation when VMA is known

2024-01-29 Thread Alexandru Elisei
Hi Peter,

On Fri, Jan 26, 2024 at 12:00:36PM -0800, Peter Collingbourne wrote:
> On Thu, Jan 25, 2024 at 8:43 AM Alexandru Elisei
>  wrote:
> >
> > arm64 uses VM_HIGH_ARCH_0 and VM_HIGH_ARCH_1 for enabling MTE for a VMA.
> > When VM_HIGH_ARCH_0, which arm64 renames to VM_MTE, is set for a VMA, and
> > the gfp flag __GFP_ZERO is present, the __GFP_ZEROTAGS gfp flag also gets
> > set in vma_alloc_zeroed_movable_folio().
> >
> > Expand this to be more generic by adding an arch hook that modifies the gfp
> > flags for an allocation when the VMA is known.
> >
> > Note that __GFP_ZEROTAGS is ignored by the page allocator unless __GFP_ZERO
> > is also set; from that point of view, the current behaviour is unchanged,
> > even though the arm64 flag is set in more places.  When arm64 will have
> > support to reuse the tag storage for data allocation, the uses of the
> > __GFP_ZEROTAGS flag will be expanded to instruct the page allocator to try
> > to reserve the corresponding tag storage for the pages being allocated.
> >
> > The flags returned by arch_calc_vma_gfp() are or'ed with the flags set by
> > the caller; this has been done to keep an architecture from modifying the
> > flags already set by the core memory management code; this is similar to
> > how do_mmap() -> calc_vm_flag_bits() -> arch_calc_vm_flag_bits() has been
> > implemented. This can be revisited in the future if there's a need to do
> > so.
> >
> > Signed-off-by: Alexandru Elisei 
> 
> This patch also needs to update the non-CONFIG_NUMA definition of
> vma_alloc_folio in include/linux/gfp.h to call arch_calc_vma_gfp. See:
> https://r.android.com/2849146

Of course, you've already reported this to me; I cherry-picked the version of
the patch that doesn't have the fix for this series.

Will fix.

Thanks,
Alex

> 
> Peter



Re: [PATCH RFC v3 07/35] mm: cma: Add CMA_RELEASE_{SUCCESS,FAIL} events

2024-01-29 Thread Alexandru Elisei
Hi,

On Mon, Jan 29, 2024 at 03:01:24PM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > Similar to the two events that relate to CMA allocations, add the
> > CMA_RELEASE_SUCCESS and CMA_RELEASE_FAIL events that count when CMA pages
> > are freed.
> 
> How is this going to be beneficial towards analyzing CMA alloc/release
> behaviour - particularly with respect to this series. OR just adding this
> from parity perspective with CMA alloc side counters ? Regardless this
> CMA change too could be discussed separately.

Added for parity and because it's useful for this series (see my reply to
the previous patch where I discuss how I've used the counters).

Thanks,
Alex

> 
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> > 
> > Changes since rfc v2:
> > 
> > * New patch.
> > 
> >  include/linux/vm_event_item.h | 2 ++
> >  mm/cma.c  | 6 +-
> >  mm/vmstat.c   | 2 ++
> >  3 files changed, 9 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> > index 747943bc8cc2..aba5c5bf8127 100644
> > --- a/include/linux/vm_event_item.h
> > +++ b/include/linux/vm_event_item.h
> > @@ -83,6 +83,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
> >  #ifdef CONFIG_CMA
> > CMA_ALLOC_SUCCESS,
> > CMA_ALLOC_FAIL,
> > +   CMA_RELEASE_SUCCESS,
> > +   CMA_RELEASE_FAIL,
> >  #endif
> > UNEVICTABLE_PGCULLED,   /* culled to noreclaim list */
> > UNEVICTABLE_PGSCANNED,  /* scanned for reclaimability */
> > diff --git a/mm/cma.c b/mm/cma.c
> > index dbf7fe8cb1bd..543bb6b3be8e 100644
> > --- a/mm/cma.c
> > +++ b/mm/cma.c
> > @@ -562,8 +562,10 @@ bool cma_release(struct cma *cma, const struct page 
> > *pages,
> >  {
> > unsigned long pfn;
> >  
> > -   if (!cma_pages_valid(cma, pages, count))
> > +   if (!cma_pages_valid(cma, pages, count)) {
> > +   count_vm_events(CMA_RELEASE_FAIL, count);
> > return false;
> > +   }
> >  
> > pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
> >  
> > @@ -575,6 +577,8 @@ bool cma_release(struct cma *cma, const struct page 
> > *pages,
> > cma_clear_bitmap(cma, pfn, count);
> > trace_cma_release(cma->name, pfn, pages, count);
> >  
> > +   count_vm_events(CMA_RELEASE_SUCCESS, count);
> > +
> > return true;
> >  }
> >  
> > diff --git a/mm/vmstat.c b/mm/vmstat.c
> > index db79935e4a54..eebfd5c6c723 100644
> > --- a/mm/vmstat.c
> > +++ b/mm/vmstat.c
> > @@ -1340,6 +1340,8 @@ const char * const vmstat_text[] = {
> >  #ifdef CONFIG_CMA
> > "cma_alloc_success",
> > "cma_alloc_fail",
> > +   "cma_release_success",
> > +   "cma_release_fail",
> >  #endif
> > "unevictable_pgs_culled",
> > "unevictable_pgs_scanned",



Re: [PATCH RFC v3 06/35] mm: cma: Make CMA_ALLOC_SUCCESS/FAIL count the number of pages

2024-01-29 Thread Alexandru Elisei
Hi,

On Mon, Jan 29, 2024 at 02:54:20PM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > The CMA_ALLOC_SUCCESS, respectively CMA_ALLOC_FAIL, are increased by one
> > after each cma_alloc() function call. This is done even though cma_alloc()
> > can allocate an arbitrary number of CMA pages. When looking at
> > /proc/vmstat, the number of successful (or failed) cma_alloc() calls
> > doesn't tell much with regards to how many CMA pages were allocated via
> > cma_alloc() versus via the page allocator (regular allocation request or
> > PCP lists refill).
> > 
> > This can also be rather confusing to a user who isn't familiar with the
> > code, since the unit of measurement for nr_free_cma is the number of pages,
> > but cma_alloc_success and cma_alloc_fail count the number of cma_alloc()
> > function calls.
> > 
> > Let's make this consistent, and arguably more useful, by having
> > CMA_ALLOC_SUCCESS count the number of successfully allocated CMA pages, and
> > CMA_ALLOC_FAIL count the number of pages the cma_alloc() failed to
> > allocate.
> > 
> > For users that wish to track the number of cma_alloc() calls, there are
> > tracepoints for that already implemented.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  mm/cma.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/mm/cma.c b/mm/cma.c
> > index f49c95f8ee37..dbf7fe8cb1bd 100644
> > --- a/mm/cma.c
> > +++ b/mm/cma.c
> > @@ -517,10 +517,10 @@ struct page *cma_alloc(struct cma *cma, unsigned long 
> > count,
> > pr_debug("%s(): returned %p\n", __func__, page);
> >  out:
> > if (page) {
> > -   count_vm_event(CMA_ALLOC_SUCCESS);
> > +   count_vm_events(CMA_ALLOC_SUCCESS, count);
> > cma_sysfs_account_success_pages(cma, count);
> > } else {
> > -   count_vm_event(CMA_ALLOC_FAIL);
> > +   count_vm_events(CMA_ALLOC_FAIL, count);
> > if (cma)
> > cma_sysfs_account_fail_pages(cma, count);
> > }
> 
> Without getting into the merits of this patch - which is actually trying to do
> semantics change to /proc/vmstat, wondering how is this even related to this
> particular series ? If required this could be debated on its own separately.

Having the number of CMA pages allocated and the number of CMA pages freed
allows someone to infer how many tagged pages are in use at a given time:
(allocated CMA pages - CMA pages allocated by drivers* - CMA pages
released) * 32. That is valuable information for software and hardware
designers.

Besides that, for every iteration of the series, this has proven invaluable
for discovering bugs with freeing and/or reserving tag storage pages.

*that would require userspace reading cma_alloc_success and
cma_release_success before any tagged allocations are performed.

Thanks,
Alex



Re: [PATCH RFC v3 05/35] mm: cma: Don't append newline when generating CMA area name

2024-01-29 Thread Alexandru Elisei
Hi,

On Mon, Jan 29, 2024 at 02:43:08PM +0530, Anshuman Khandual wrote:
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > cma->name is displayed in several CMA messages. When the name is generated
> > by the CMA code, don't append a newline to avoid breaking the text across
> > two lines.
> 
> An example of such mis-formatted CMA output from dmesg could be added
> here in the commit message to demonstrate the problem better.
> 
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> 
> Regardless, LGTM.
> 
> Reviewed-by: Anshuman Khandual 

Thanks!

> 
> > 
> > Changes since rfc v2:
> > 
> > * New patch. This is a fix, and can be merged independently of the other
> > patches.
> 
> Right, need not be part of this series. Hence please send it separately to
> the MM list.

Will do!

Alex

> 
> > 
> >  mm/cma.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/mm/cma.c b/mm/cma.c
> > index 7c09c47e530b..f49c95f8ee37 100644
> > --- a/mm/cma.c
> > +++ b/mm/cma.c
> > @@ -204,7 +204,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, 
> > phys_addr_t size,
> > if (name)
> > snprintf(cma->name, CMA_MAX_NAME, name);
> > else
> > -   snprintf(cma->name, CMA_MAX_NAME,  "cma%d\n", cma_area_count);
> > +   snprintf(cma->name, CMA_MAX_NAME,  "cma%d", cma_area_count);
> >  
> > cma->base_pfn = PFN_DOWN(base);
> > cma->count = size >> PAGE_SHIFT;



Re: [PATCH RFC v3 04/35] mm: page_alloc: Partially revert "mm: page_alloc: remove stale CMA guard code"

2024-01-29 Thread Alexandru Elisei
Hi,

On Mon, Jan 29, 2024 at 02:31:23PM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > The patch f945116e4e19 ("mm: page_alloc: remove stale CMA guard code")
> > removed the CMA filter when allocating from the MIGRATE_MOVABLE pcp list
> > because CMA is always allowed when __GFP_MOVABLE is set.
> > 
> > With the introduction of the arch_alloc_cma() function, the above is not
> > true anymore, so bring back the filter.
> 
> This makes sense as arch_alloc_cma() now might prevent ALLOC_CMA being
> assigned to alloc_flags in gfp_to_alloc_flags_cma().

Can I add your Reviewed-by tag then?

Thanks,
Alex

> 
> > 
> > This is a partial revert because the stale comment remains removed.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  mm/page_alloc.c | 15 +++
> >  1 file changed, 11 insertions(+), 4 deletions(-)
> > 
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index a96d47a6393e..0fa34bcfb1af 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -2897,10 +2897,17 @@ struct page *rmqueue(struct zone *preferred_zone,
> > WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
> >  
> > if (likely(pcp_allowed_order(order))) {
> > -   page = rmqueue_pcplist(preferred_zone, zone, order,
> > -  migratetype, alloc_flags);
> > -   if (likely(page))
> > -   goto out;
> > +   /*
> > +* MIGRATE_MOVABLE pcplist could have the pages on CMA area and
> > +* we need to skip it when CMA area isn't allowed.
> > +*/
> > +   if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
> > +   migratetype != MIGRATE_MOVABLE) {
> > +   page = rmqueue_pcplist(preferred_zone, zone, order,
> > +   migratetype, alloc_flags);
> > +   if (likely(page))
> > +   goto out;
> > +   }
> > }
> >  
> > page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,



Re: [PATCH RFC v3 03/35] mm: page_alloc: Add an arch hook to filter MIGRATE_CMA allocations

2024-01-29 Thread Alexandru Elisei
Hi,

On Mon, Jan 29, 2024 at 02:14:16PM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > As an architecture might have specific requirements around the allocation
> > of CMA pages, add an arch hook that can disable allocations from
> > MIGRATE_CMA, if the allocation was otherwise allowed.
> > 
> > This will be used by arm64, which will put tag storage pages on the
> > MIGRATE_CMA list, and tag storage pages cannot be tagged. The filter will
> > be used to deny using MIGRATE_CMA for __GFP_TAGGED allocations.
> 
> Just wondering how allocation requests would be blocked for direct
> alloc_contig_range() requests ?

alloc_contig_range() does page allocation in __alloc_contig_migrate_range()
-> alloc_migration_target(); __alloc_contig_migrate_range() ignores the
gfp_mask parameter passed to alloc_contig_range() when building struct
migration_target_control, even though it's available in the struct
compact_control argument. That looks like a bug to me, as the description
for the gfp_mask parameter says: "GFP mask to use during compaction".

Regardless, when tag storage page T1 is migrated so it can be used to
store tags, it doesn't matter if it is replaced by another tag storage
page T2 or a regular page, as long as the replacement isn't also tagged. If
the replacement is also tagged, the code to reserve tag storage would
recurse and deadlock. See patch #16 ("KVM: arm64: Don't deny VM_PFNMAP VMAs
when kvm_has_mte()") [1] for the code.

Does that make sense?

[1] 
https://lore.kernel.org/linux-mm/20240125164256.4147-24-alexandru.eli...@arm.com/

> 
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  include/linux/pgtable.h | 7 +++
> >  mm/page_alloc.c | 3 ++-
> >  2 files changed, 9 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index 6d98d5fdd697..c5ddec6b5305 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -905,6 +905,13 @@ static inline void arch_do_swap_page(struct mm_struct 
> > *mm,
> >  static inline void arch_free_pages_prepare(struct page *page, int order) { 
> > }
> >  #endif
> >  
> > +#ifndef __HAVE_ARCH_ALLOC_CMA
> 
> Same as last patch i.e __HAVE_ARCH_ALLOC_CMA could be avoided via
> a direct check on #ifndef arch_alloc_cma instead.

include/linux/pgtable.h uses __HAVE_ARCH_*, and I would rather keep it
consistent.

Thanks,
Alex

> 
> > +static inline bool arch_alloc_cma(gfp_t gfp)
> > +{
> > +   return true;
> > +}
> > +#endif
> > +
> >  #ifndef __HAVE_ARCH_UNMAP_ONE
> >  /*
> >   * Some architectures support metadata associated with a page. When a
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 27282a1c82fe..a96d47a6393e 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -3157,7 +3157,8 @@ static inline unsigned int 
> > gfp_to_alloc_flags_cma(gfp_t gfp_mask,
> >   unsigned int alloc_flags)
> >  {
> >  #ifdef CONFIG_CMA
> > -   if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
> > +   if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
> > +   arch_alloc_cma(gfp_mask))
> > alloc_flags |= ALLOC_CMA;
> >  #endif
> > return alloc_flags;



Re: [PATCH RFC v3 02/35] mm: page_alloc: Add an arch hook early in free_pages_prepare()

2024-01-29 Thread Alexandru Elisei
Hi,

On Mon, Jan 29, 2024 at 01:49:44PM +0530, Anshuman Khandual wrote:
> 
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > The arm64 MTE code uses the PG_arch_2 page flag, which it renames to
> > PG_mte_tagged, to track if a page has been mapped with tagging enabled.
> > That flag is cleared by free_pages_prepare() by doing:
> > 
> > page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
> > 
> > When tag storage management is added, tag storage will be reserved for a
> > page if and only if the page is mapped as tagged (the page flag
> > PG_mte_tagged is set). When a page is freed, likewise, the code will have
> > to look at the page flags to determine if the page has tag storage
> > reserved, which should also be freed.
> > 
> > For this purpose, add an arch_free_pages_prepare() hook that is called
> > before the page flags are cleared. The function arch_free_page() has also
> > been considered for this purpose, but it is called after the flags are
> > cleared.
> 
> arch_free_pages_prepare() makes sense as a prologue to arch_free_page().  

Thanks!

> 
> s/arch_free_pages_prepare/arch_free_page_prepare to match similar functions.

The function free_pages_prepare() calls the function arch_free_pages_prepare().
I find that consistent, and it makes it easy to identify from where
arch_free_pages_prepare() is called.

Thanks,
Alex

> 
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> > 
> > Changes since rfc v2:
> > 
> > * Expanded commit message (David Hildenbrand).
> > 
> >  include/linux/pgtable.h | 4 
> >  mm/page_alloc.c | 1 +
> >  2 files changed, 5 insertions(+)
> > 
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index f6d0e3513948..6d98d5fdd697 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -901,6 +901,10 @@ static inline void arch_do_swap_page(struct mm_struct 
> > *mm,
> >  }
> >  #endif
> >  
> > +#ifndef __HAVE_ARCH_FREE_PAGES_PREPARE
> 
> I guess new __HAVE_ARCH_ constructs are not being added lately. Instead
> something like '#ifndef arch_free_pages_prepare' might be better suited.
> 
> > +static inline void arch_free_pages_prepare(struct page *page, int order) { 
> > }
> > +#endif
> > +
> >  #ifndef __HAVE_ARCH_UNMAP_ONE
> >  /*
> >   * Some architectures support metadata associated with a page. When a
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 2c140abe5ee6..27282a1c82fe 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1092,6 +1092,7 @@ static __always_inline bool free_pages_prepare(struct 
> > page *page,
> >  
> > trace_mm_page_free(page, order);
> > kmsan_free_page(page, order);
> > +   arch_free_pages_prepare(page, order);
> >  
> > if (memcg_kmem_online() && PageMemcgKmem(page))
> > __memcg_kmem_uncharge_page(page, order);



Re: [PATCH RFC v3 01/35] mm: page_alloc: Add gfp_flags parameter to arch_alloc_page()

2024-01-29 Thread Alexandru Elisei
Hi,

On Mon, Jan 29, 2024 at 11:18:59AM +0530, Anshuman Khandual wrote:
> 
> On 1/25/24 22:12, Alexandru Elisei wrote:
> > Extend the usefulness of arch_alloc_page() by adding the gfp_flags
> > parameter.
> 
> Although the change here is harmless in itself, it will definitely benefit
> from some additional context explaining the rationale, taking into account
> why-how arch_alloc_page() got added particularly for s390 platform and how
> it's going to be used in the present proposal.

arm64 will use it to reserve tag storage if the caller requested a tagged
page. Right now that means that __GFP_ZEROTAGS is set in the gfp mask, but
I'll rename it to __GFP_TAGGED in patch #18 ("arm64: mte: Rename
__GFP_ZEROTAGS to __GFP_TAGGED") [1].

[1] 
https://lore.kernel.org/lkml/20240125164256.4147-19-alexandru.eli...@arm.com/

Thanks,
Alex

> 
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> > 
> > Changes since rfc v2:
> > 
> > * New patch.
> > 
> >  arch/s390/include/asm/page.h | 2 +-
> >  arch/s390/mm/page-states.c   | 2 +-
> >  include/linux/gfp.h  | 2 +-
> >  mm/page_alloc.c  | 2 +-
> >  4 files changed, 4 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
> > index 73b9c3bf377f..859f0958c574 100644
> > --- a/arch/s390/include/asm/page.h
> > +++ b/arch/s390/include/asm/page.h
> > @@ -163,7 +163,7 @@ static inline int page_reset_referenced(unsigned long 
> > addr)
> >  
> >  struct page;
> >  void arch_free_page(struct page *page, int order);
> > -void arch_alloc_page(struct page *page, int order);
> > +void arch_alloc_page(struct page *page, int order, gfp_t gfp_flags);
> >  
> >  static inline int devmem_is_allowed(unsigned long pfn)
> >  {
> > diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
> > index 01f9b39e65f5..b986c8b158e3 100644
> > --- a/arch/s390/mm/page-states.c
> > +++ b/arch/s390/mm/page-states.c
> > @@ -21,7 +21,7 @@ void arch_free_page(struct page *page, int order)
> > __set_page_unused(page_to_virt(page), 1UL << order);
> >  }
> >  
> > -void arch_alloc_page(struct page *page, int order)
> > +void arch_alloc_page(struct page *page, int order, gfp_t gfp_flags)
> >  {
> > if (!cmma_flag)
> > return;
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index de292a007138..9e8aa3d144db 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -172,7 +172,7 @@ static inline struct zonelist *node_zonelist(int nid, 
> > gfp_t flags)
> >  static inline void arch_free_page(struct page *page, int order) { }
> >  #endif
> >  #ifndef HAVE_ARCH_ALLOC_PAGE
> > -static inline void arch_alloc_page(struct page *page, int order) { }
> > +static inline void arch_alloc_page(struct page *page, int order, gfp_t 
> > gfp_flags) { }
> >  #endif
> >  
> >  struct page *__alloc_pages(gfp_t gfp, unsigned int order, int 
> > preferred_nid,
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 150d4f23b010..2c140abe5ee6 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1485,7 +1485,7 @@ inline void post_alloc_hook(struct page *page, 
> > unsigned int order,
> > set_page_private(page, 0);
> > set_page_refcounted(page);
> >  
> > -   arch_alloc_page(page, order);
> > +   arch_alloc_page(page, order, gfp_flags);
> > debug_pagealloc_map_pages(page, 1 << order);
> >  
> > /*
> 
> Otherwise LGTM.



Re: [PATCH RFC v3 19/35] arm64: mte: Discover tag storage memory

2024-01-26 Thread Alexandru Elisei
Hi Krzysztof,

On Fri, Jan 26, 2024 at 09:50:58AM +0100, Krzysztof Kozlowski wrote:
> On 25/01/2024 17:42, Alexandru Elisei wrote:
> > Allow the kernel to get the base address, size, block size and associated
> > memory node for tag storage from the device tree blob.
> > 
> 
> Please use scripts/get_maintainer.pl to get a list of necessary people
> and lists to CC. It might happen, that command when run on an older
> kernel, gives you outdated entries. Therefore please be sure you base
> your patches on recent Linux kernel.
> 
> Tools like b4 or scripts/get_maintainer.pl provide you proper list of
> people, so fix your workflow. Tools might also fail if you work on some
> ancient tree (don't, use mainline), work on fork of kernel (don't, use
> mainline) or you ignore some maintainers (really don't). Just use b4 and
> all the problems go away.
> 
> You missed at least devicetree list (maybe more), so this won't be
> tested by automated tooling. Performing review on untested code might be
> a waste of time, thus I will skip this patch entirely till you follow
> the process allowing the patch to be tested.
> 
> Please kindly resend and include all necessary To/Cc entries.

My mistake, the previous iteration of the series didn't include a
devicetree binding and I forgot to update the To/Cc list. Thank you for the
heads-up, hopefully you can have a look after I resend the series.

> 
> 
> > A tag storage region represents the smallest contiguous memory region that
> > holds all the tags for the associated contiguous memory region which can be
> > tagged. For example, for a 32GB contiguous tagged memory the corresponding
> > tag storage region is exactly 1GB of contiguous memory, not two adjacent
> > 512M of tag storage memory, nor one 2GB tag storage region.
> > 
> > Tag storage is described as reserved memory; future patches will teach the
> > kernel how to make use of it for data (non-tagged) allocations.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> > 
> > Changes since rfc v2:
> > 
> > * Reworked from rfc v2 patch #11 ("arm64: mte: Reserve tag storage memory").
> > * Added device tree schema (Rob Herring)
> > * Tag storage memory is now described in the "reserved-memory" node (Rob
> > Herring).
> > 
> >  .../reserved-memory/arm,mte-tag-storage.yaml  |  78 +
> 
> Please run scripts/checkpatch.pl and fix reported warnings. Some
> warnings can be ignored, but the code here looks like it needs a fix.
> Feel free to get in touch if the warning is not clear.

Thank you for pointing it out, I'll move the binding to a separate patch.

Alex



[PATCH RFC v3 35/35] HACK! arm64: dts: Add fake tag storage to fvp-base-revc.dts

2024-01-25 Thread Alexandru Elisei
Faking a tag storage region for FVP is useful for testing.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch, not intended to be merged.

 arch/arm64/boot/dts/arm/fvp-base-revc.dts | 42 +--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/boot/dts/arm/fvp-base-revc.dts 
b/arch/arm64/boot/dts/arm/fvp-base-revc.dts
index 60472d65a355..e9f44420cb62 100644
--- a/arch/arm64/boot/dts/arm/fvp-base-revc.dts
+++ b/arch/arm64/boot/dts/arm/fvp-base-revc.dts
@@ -165,10 +165,30 @@ C1_L2: l2-cache1 {
};
};
 
-   memory@8000 {
+   memory0: memory@8000 {
device_type = "memory";
-   reg = <0x 0x8000 0 0x8000>,
- <0x0008 0x8000 0 0x8000>;
+   reg = <0x00 0x8000 0x00 0x8000>;
+   numa-node-id = <0x00>;
+   };
+
+   /* tags0 */
+   tags_memory0: memory@8f800 {
+   device_type = "memory";
+   reg = <0x08 0xf800 0x00 0x400>;
+   numa-node-id = <0x00>;
+   };
+
+   memory1: memory@88000 {
+   device_type = "memory";
+   reg = <0x08 0x8000 0x00 0x7800>;
+   numa-node-id = <0x01>;
+   };
+
+   /* tags1 */
+   tags_memory1: memory@8fc0 {
+   device_type = "memory";
+   reg = <0x08 0xfc00 0x00 0x3c0>;
+   numa-node-id = <0x01>;
};
 
reserved-memory {
@@ -183,6 +203,22 @@ vram: vram@1800 {
reg = <0x 0x1800 0 0x0080>;
no-map;
};
+
+   tags0: tag-storage@8f800 {
+   compatible = "arm,mte-tag-storage";
+   reg = <0x08 0xf800 0x00 0x400>;
+   block-size = <0x1000>;
+   tagged-memory = <>;
+   reusable;
+   };
+
+   tags1: tag-storage@8fc0 {
+   compatible = "arm,mte-tag-storage";
+   reg = <0x08 0xfc00 0x00 0x3c0>;
+   block-size = <0x1000>;
+   tagged-memory = <>;
+   reusable;
+   };
};
 
gic: interrupt-controller@2f00 {
-- 
2.43.0




[PATCH RFC v3 34/35] arm64: mte: Enable dynamic tag storage management

2024-01-25 Thread Alexandru Elisei
Everything is in place, enable tag storage management.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 088e30fc6d12..95c153705a2c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2084,7 +2084,7 @@ config ARM64_MTE
 
 if ARM64_MTE
 config ARM64_MTE_TAG_STORAGE
-   bool
+   bool "MTE tag storage management"
select ARCH_HAS_FAULT_ON_ACCESS
select CONFIG_CMA
help
-- 
2.43.0




[PATCH RFC v3 33/35] KVM: arm64: mte: Introduce VM_MTE_KVM VMA flag

2024-01-25 Thread Alexandru Elisei
Tag storage pages mapped by the host in a VM with MTE enabled are migrated
when they are first accessed by the guest. This introduces latency spikes
for memory accesses made by the guest.

Tag storage pages can be mapped in the guest memory when the VM_MTE VMA
flag is not set. Introduce a new VMA flag, VM_MTE_KVM, to stop tag storage
pages from being mapped in a VM with MTE enabled.

The flag is different from VM_MTE, because the pages from the VMA won't be
mapped as tagged in the host, and host's userspace can continue to access
the guest memory as Untagged. The flag's only function is to instruct the
page allocator to treat the allocation as tagged, so tag storage pages
aren't used. The page allocator will also try to reserve tag storage for
the new page, which can speed up stage 2 aborts further if the VMM has
accessed the memory before the guest. For example, qemu and kvmtool will
benefit from this change because the guest image is copied after the
memslot is created.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch.

 arch/arm64/kvm/mmu.c  | 77 ++-
 arch/arm64/mm/fault.c |  2 +-
 include/linux/mm.h|  2 ++
 3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 986a9544228d..45c57c4b9fe2 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1420,7 +1420,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
unsigned long mmu_seq;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = >arch.mmu_page_cache;
-   struct vm_area_struct *vma;
+   struct vm_area_struct *vma, *old_vma;
short vma_shift;
gfn_t gfn;
kvm_pfn_t pfn;
@@ -1428,6 +1428,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
+   bool vma_has_kvm_mte = false;
 
if (fault_is_perm)
fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
@@ -1506,6 +1507,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
 
gfn = fault_ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
+   vma_has_kvm_mte = !!(vma->vm_flags & VM_MTE_KVM);
+   old_vma = vma;
 
/* Don't use the VMA after the unlock -- it may have vanished */
vma = NULL;
@@ -1521,6 +1524,27 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
 
+   /*
+* If the VMA was created after the memslot, it doesn't have the
+* VM_MTE_KVM flag set.
+*/
+   if (unlikely(tag_storage_enabled() && !fault_is_perm &&
+   kvm_has_mte(kvm) && mte_allowed && !vma_has_kvm_mte)) {
+   mmap_write_lock(current->mm);
+   vma = vma_lookup(current->mm, hva);
+   /* The VMA was changed, replay the fault. */
+   if (vma != old_vma) {
+   mmap_write_unlock(current->mm);
+   return 0;
+   }
+   if (!(vma->vm_flags & VM_MTE_KVM)) {
+   vma_start_write(vma);
+   vm_flags_reset(vma, vma->vm_flags | VM_MTE_KVM);
+   }
+   vma = NULL;
+   mmap_write_unlock(current->mm);
+   }
+
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
   write_fault, , NULL);
 
@@ -1986,6 +2010,40 @@ int __init kvm_mmu_init(u32 *hyp_va_bits)
return err;
 }
 
+static int kvm_set_clear_kvm_mte_vma(const struct kvm_memory_slot *memslot, 
bool set)
+{
+   struct vm_area_struct *vma;
+   hva_t hva, memslot_end;
+   int ret = 0;
+
+   hva = memslot->userspace_addr;
+   memslot_end = hva + (memslot->npages << PAGE_SHIFT);
+
+   mmap_write_lock(current->mm);
+
+   do {
+   vma = find_vma_intersection(current->mm, hva, memslot_end);
+   if (!vma)
+   break;
+   if (!kvm_vma_mte_allowed(vma))
+   continue;
+   if (set) {
+   if (!(vma->vm_flags & VM_MTE_KVM)) {
+   vma_start_write(vma);
+   vm_flags_reset(vma, vma->vm_flags | VM_MTE_KVM);
+   }
+   } else if (vma->vm_flags & VM_MTE_KVM) {
+   vma_start_write(vma);
+   vm_flags_reset(vma, vma->vm_flags & ~VM_MTE_KVM);
+   }
+   hva = min(memslot_end, vma->vm_end);
+   } while (hva < mems

[PATCH RFC v3 32/35] KVM: arm64: mte: Reserve tag storage for virtual machines with MTE

2024-01-25 Thread Alexandru Elisei
KVM allows MTE enabled VMs to be created when the backing VMA does not have
MTE enabled. As a result, pages allocated for the virtual machine's memory
won't have tag storage reserved. Try to reserve tag storage the first time
the page is accessed by the guest. This is similar to how pages mapped
without tag storage in an MTE VMA are handled.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch.

 arch/arm64/include/asm/mte_tag_storage.h | 10 ++
 arch/arm64/include/asm/pgtable.h |  7 +++-
 arch/arm64/kvm/mmu.c | 43 
 arch/arm64/mm/fault.c|  2 +-
 4 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
b/arch/arm64/include/asm/mte_tag_storage.h
index 40590a8c3748..32940ef7bcdf 100644
--- a/arch/arm64/include/asm/mte_tag_storage.h
+++ b/arch/arm64/include/asm/mte_tag_storage.h
@@ -34,6 +34,8 @@ void free_tag_storage(struct page *page, int order);
 bool page_tag_storage_reserved(struct page *page);
 bool page_is_tag_storage(struct page *page);
 
+int replace_folio_with_tagged(struct folio *folio);
+
 vm_fault_t handle_folio_missing_tag_storage(struct folio *folio, struct 
vm_fault *vmf,
bool *map_pte);
 vm_fault_t mte_try_transfer_swap_tags(swp_entry_t entry, struct page *page);
@@ -67,6 +69,14 @@ static inline bool page_tag_storage_reserved(struct page 
*page)
 {
return true;
 }
+static inline bool page_is_tag_storage(struct page *page)
+{
+   return false;
+}
+static inline int replace_folio_with_tagged(struct folio *folio)
+{
+   return -EINVAL;
+}
 #endif /* CONFIG_ARM64_MTE_TAG_STORAGE */
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index d0473538c926..7f89606ad617 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1108,7 +1108,12 @@ static inline void arch_swap_restore(swp_entry_t entry, 
struct folio *folio)
 #define __HAVE_ARCH_FREE_PAGES_PREPARE
 static inline void arch_free_pages_prepare(struct page *page, int order)
 {
-   if (tag_storage_enabled() && page_mte_tagged(page))
+   /*
+* KVM can free a page after tag storage has been reserved and before is
+* marked as tagged, hence use page_tag_storage_reserved() instead of
+* page_mte_tagged() to check for tag storage.
+*/
+   if (tag_storage_enabled() && page_tag_storage_reserved(page))
free_tag_storage(page, order);
 }
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index b7517c4a19c4..986a9544228d 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1361,6 +1361,8 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t 
pfn,
if (!kvm_has_mte(kvm))
return;
 
+   WARN_ON_ONCE(tag_storage_enabled() && 
!page_tag_storage_reserved(pfn_to_page(pfn)));
+
for (i = 0; i < nr_pages; i++, page++) {
if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
@@ -1374,6 +1376,39 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct 
*vma)
return vma->vm_flags & VM_MTE_ALLOWED;
 }
 
+/*
+ * Called with an elevated reference on the pfn. If successful, the reference
+ * count is not changed. If it returns an error, the elevated reference is
+ * dropped.
+ */
+static int kvm_mte_reserve_tag_storage(kvm_pfn_t pfn)
+{
+   struct folio *folio;
+   int ret;
+
+   folio = page_folio(pfn_to_page(pfn));
+
+   if (page_tag_storage_reserved(folio_page(folio, 0)))
+return 0;
+
+   if (page_is_tag_storage(folio_page(folio, 0)))
+   goto migrate;
+
+   ret = reserve_tag_storage(folio_page(folio, 0), folio_order(folio),
+ GFP_HIGHUSER_MOVABLE);
+   if (!ret)
+   return 0;
+
+migrate:
+   replace_folio_with_tagged(folio);
+   /*
+* If migration succeeds, the fault needs to be replayed because 'pfn'
+* has been unmapped. If migration fails, KVM will try to reserve tag
+* storage again by replaying the fault.
+*/
+   return -EAGAIN;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
  struct kvm_memory_slot *memslot, unsigned long hva,
  bool fault_is_perm)
@@ -1488,6 +1523,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
 
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
   write_fault, , NULL);
+
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
return 0;
@@ -1518,6 +1554,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
if (exec_fault &&am

[PATCH RFC v3 31/35] khugepaged: arm64: Don't collapse MTE enabled VMAs

2024-01-25 Thread Alexandru Elisei
copy_user_highpage() will do memory allocation if there are saved tags for
the destination page, and the page is missing tag storage.

After commit a349d72fd9ef ("mm/pgtable: add rcu_read_lock() and
rcu_read_unlock()s"), collapse_huge_page() calls
__collapse_huge_page_copy() -> .. -> copy_user_highpage() with the RCU lock
held, which means that copy_user_highpage() can only allocate memory using
GFP_ATOMIC or equivalent.

Get around this by refusing to collapse pages into a transparent huge page
if the VMA is MTE-enabled.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch. I think an agreement on whether copy*_user_highpage() should be
always allowed to sleep, or should not be allowed, would be useful.

 arch/arm64/include/asm/pgtable.h| 3 +++
 arch/arm64/kernel/mte_tag_storage.c | 5 +
 include/linux/khugepaged.h  | 5 +
 mm/khugepaged.c | 4 
 4 files changed, 17 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 87ae59436162..d0473538c926 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1120,6 +1120,9 @@ static inline bool arch_alloc_cma(gfp_t gfp_mask)
return true;
 }
 
+bool arch_hugepage_vma_revalidate(struct vm_area_struct *vma, unsigned long 
address);
+#define arch_hugepage_vma_revalidate arch_hugepage_vma_revalidate
+
 #endif /* CONFIG_ARM64_MTE_TAG_STORAGE */
 #endif /* CONFIG_ARM64_MTE */
 
diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index ac7b9c9c585c..a99959b70573 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -636,3 +636,8 @@ void arch_alloc_page(struct page *page, int order, gfp_t 
gfp)
if (tag_storage_enabled() && alloc_requires_tag_storage(gfp))
reserve_tag_storage(page, order, gfp);
 }
+
+bool arch_hugepage_vma_revalidate(struct vm_area_struct *vma, unsigned long 
address)
+{
+   return !(vma->vm_flags & VM_MTE);
+}
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index f68865e19b0b..461e4322dff2 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -38,6 +38,11 @@ static inline void khugepaged_exit(struct mm_struct *mm)
if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
__khugepaged_exit(mm);
 }
+
+#ifndef arch_hugepage_vma_revalidate
+#define arch_hugepage_vma_revalidate(vma, address) 1
+#endif
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct 
*oldmm)
 {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2b219acb528e..cb9a9ddb4d86 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -935,6 +935,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, 
unsigned long address,
 */
if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
return SCAN_PAGE_ANON;
+
+   if (!arch_hugepage_vma_revalidate(vma, address))
+   return SCAN_VMA_CHECK;
+
return SCAN_SUCCEED;
 }
 
-- 
2.43.0




[PATCH RFC v3 30/35] arm64: mte: ptrace: Handle pages with missing tag storage

2024-01-25 Thread Alexandru Elisei
A page can end up mapped in an MTE enabled VMA without the corresponding tag
storage block reserved. Tag accesses made by ptrace in this case can lead
to the wrong tags being read or memory corruption for the process that is
using the tag storage memory as data.

Reserve tag storage by treating ptrace accesses like a fault.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch, issue reported by Peter Collingbourne.

 arch/arm64/kernel/mte.c | 26 --
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index faf09da3400a..b1fa02dad4fd 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -412,10 +412,13 @@ static int __access_remote_tags(struct mm_struct *mm, 
unsigned long addr,
while (len) {
struct vm_area_struct *vma;
unsigned long tags, offset;
+   unsigned int fault_flags;
+   struct page *page;
+   vm_fault_t ret;
void *maddr;
-   struct page *page = get_user_page_vma_remote(mm, addr,
-gup_flags, &vma);
 
+get_page:
+   page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
if (IS_ERR(page)) {
err = PTR_ERR(page);
break;
@@ -433,6 +436,25 @@ static int __access_remote_tags(struct mm_struct *mm, 
unsigned long addr,
put_page(page);
break;
}
+
+   if (tag_storage_enabled() && !page_tag_storage_reserved(page)) {
+   fault_flags = FAULT_FLAG_DEFAULT | \
+ FAULT_FLAG_USER | \
+ FAULT_FLAG_REMOTE | \
+ FAULT_FLAG_ALLOW_RETRY | \
+ FAULT_FLAG_RETRY_NOWAIT;
+   if (write)
+   fault_flags |= FAULT_FLAG_WRITE;
+
+   put_page(page);
+   ret = handle_mm_fault(vma, addr, fault_flags, NULL);
+   if (ret & VM_FAULT_ERROR) {
+   err = -EFAULT;
+   break;
+   }
+   goto get_page;
+   }
+
WARN_ON_ONCE(!page_mte_tagged(page));
 
/* limit access to the end of the page */
-- 
2.43.0




[PATCH RFC v3 29/35] arm64: mte: copypage: Handle tag restoring when missing tag storage

2024-01-25 Thread Alexandru Elisei
There are several situations where copy_highpage() can end up copying
tags to a page which doesn't have its tag storage reserved.

One situation involves migration racing with mprotect(PROT_MTE): VMA is
initially untagged, migration starts and destination page is allocated
as untagged, mprotect(PROT_MTE) changes the VMA to tagged and userspace
accesses the source page, thus making it tagged.  The migration code
then calls copy_highpage(), which will copy the tags from the source
page (now tagged) to the destination page (allocated as untagged).

Yet another situation can happen during THP collapse. The huge page that
will replace the HPAGE_PMD_NR contiguous mapped pages is allocated with
__GFP_TAGGED not set. copy_highpage() will copy the tags from the pages
being replaced to the huge page which doesn't have tag storage reserved.

The situation gets even more complicated when the replacement huge page
is a tag storage page. The tag storage huge page will be migrated after
a fault on access, but the tags from the original pages must be copied
over to the huge page that will be replacing the tag storage huge page.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/mm/copypage.c | 56 
 1 file changed, 56 insertions(+)

diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index a7bb20055ce0..e991ccb43fb7 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -13,6 +13,59 @@
 #include 
 #include 
 #include 
+#include 
+
+#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
+static inline bool try_transfer_saved_tags(struct page *from, struct page *to)
+{
+   void *tags;
+   bool saved;
+
+   VM_WARN_ON_ONCE(!preemptible());
+
+   if (page_mte_tagged(from)) {
+   if (page_tag_storage_reserved(to))
+   return false;
+
+   tags = mte_allocate_tag_buf();
+   if (WARN_ON(!tags))
+   return true;
+
+   mte_copy_page_tags_to_buf(page_address(from), tags);
+   saved = mte_save_tags_for_pfn(tags, page_to_pfn(to));
+   if (!saved)
+   mte_free_tag_buf(tags);
+
+   return saved;
+   }
+
+   tags_by_pfn_lock();
+   tags = mte_erase_tags_for_pfn(page_to_pfn(from));
+   tags_by_pfn_unlock();
+
+   if (likely(!tags))
+   return false;
+
+   if (page_tag_storage_reserved(to)) {
+   WARN_ON_ONCE(!try_page_mte_tagging(to));
+   mte_copy_page_tags_from_buf(page_address(to), tags);
+   set_page_mte_tagged(to);
+   mte_free_tag_buf(tags);
+   return true;
+   }
+
+   saved = mte_save_tags_for_pfn(tags, page_to_pfn(to));
+   if (!saved)
+   mte_free_tag_buf(tags);
+
+   return saved;
+}
+#else
+static inline bool try_transfer_saved_tags(struct page *from, struct page *to)
+{
+   return false;
+}
+#endif
 
 void copy_highpage(struct page *to, struct page *from)
 {
@@ -24,6 +77,9 @@ void copy_highpage(struct page *to, struct page *from)
if (kasan_hw_tags_enabled())
page_kasan_tag_reset(to);
 
+   if (tag_storage_enabled() && try_transfer_saved_tags(from, to))
+   return;
+
if (system_supports_mte() && page_mte_tagged(from)) {
/* It's a new page, shouldn't have been tagged yet */
WARN_ON_ONCE(!try_page_mte_tagging(to));
-- 
2.43.0




[PATCH RFC v3 28/35] arm64: mte: swap: Handle tag restoring when missing tag storage

2024-01-25 Thread Alexandru Elisei
Linux restores tags when a page is swapped in and there are tags associated
with the swap entry which the new page will replace. The saved tags are
restored even if the page will not be mapped as tagged, to protect against
cases where the page is shared between different VMAs, and is tagged in
some, but untagged in others. By using this approach, the process can still
access the correct tags following an mprotect(PROT_MTE) on the non-MTE
enabled VMA.

But this poses a challenge for managing tag storage: in the scenario above,
when a new page is allocated to be swapped in for the process where it will
be mapped as untagged, the corresponding tag storage block is not reserved.
mte_restore_page_tags_by_swp_entry(), when it restores the saved tags, will
overwrite data in the tag storage block associated with the new page,
leading to data corruption if the block is in use by a process.

Get around this issue by saving the tags in a new xarray, this time indexed
by the page pfn, and then restoring them when tag storage is reserved for
the page.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* Restore saved tags **before** setting the PG_tag_storage_reserved bit to
eliminate a brief window of opportunity where userspace can access uninitialized
tags (Peter Collingbourne).

 arch/arm64/include/asm/mte_tag_storage.h |   8 ++
 arch/arm64/include/asm/pgtable.h |  11 +++
 arch/arm64/kernel/mte_tag_storage.c  |  12 ++-
 arch/arm64/mm/mteswap.c  | 110 +++
 4 files changed, 140 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
b/arch/arm64/include/asm/mte_tag_storage.h
index 50bdae94cf71..40590a8c3748 100644
--- a/arch/arm64/include/asm/mte_tag_storage.h
+++ b/arch/arm64/include/asm/mte_tag_storage.h
@@ -36,6 +36,14 @@ bool page_is_tag_storage(struct page *page);
 
 vm_fault_t handle_folio_missing_tag_storage(struct folio *folio, struct 
vm_fault *vmf,
bool *map_pte);
+vm_fault_t mte_try_transfer_swap_tags(swp_entry_t entry, struct page *page);
+
+void tags_by_pfn_lock(void);
+void tags_by_pfn_unlock(void);
+
+void *mte_erase_tags_for_pfn(unsigned long pfn);
+bool mte_save_tags_for_pfn(void *tags, unsigned long pfn);
+void mte_restore_tags_for_pfn(unsigned long start_pfn, int order);
 #else
 static inline bool tag_storage_enabled(void)
 {
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0174e292f890..87ae59436162 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1085,6 +1085,17 @@ static inline void arch_swap_invalidate_area(int type)
mte_invalidate_tags_area_by_swp_entry(type);
 }
 
+#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
+#define __HAVE_ARCH_SWAP_PREPARE_TO_RESTORE
+static inline vm_fault_t arch_swap_prepare_to_restore(swp_entry_t entry,
+ struct folio *folio)
+{
+   if (tag_storage_enabled())
+   return mte_try_transfer_swap_tags(entry, &folio->page);
+   return 0;
+}
+#endif
+
 #define __HAVE_ARCH_SWAP_RESTORE
 static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
 {
diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index afe2bb754879..ac7b9c9c585c 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -567,6 +567,7 @@ int reserve_tag_storage(struct page *page, int order, gfp_t 
gfp)
}
}
 
+   mte_restore_tags_for_pfn(page_to_pfn(page), order);
page_set_tag_storage_reserved(page, order);
 out_unlock:
mutex_unlock(_blocks_lock);
@@ -595,7 +596,8 @@ void free_tag_storage(struct page *page, int order)
struct tag_region *region;
unsigned long page_va;
unsigned long flags;
-   int ret;
+   void *tags;
+   int i, ret;
 
ret = tag_storage_find_block(page, _block, );
if (WARN_ONCE(ret, "Missing tag storage block for pfn 0x%lx", 
page_to_pfn(page)))
@@ -605,6 +607,14 @@ void free_tag_storage(struct page *page, int order)
/* Avoid writeback of dirty tag cache lines corrupting data. */
dcache_inval_tags_poc(page_va, page_va + (PAGE_SIZE << order));
 
+   tags_by_pfn_lock();
+   for (i = 0; i < (1 << order); i++) {
+   tags = mte_erase_tags_for_pfn(page_to_pfn(page + i));
+   if (unlikely(tags))
+   mte_free_tag_buf(tags);
+   }
+   tags_by_pfn_unlock();
+
end_block = start_block + order_to_num_blocks(order, 
region->block_size_pages);
 
xa_lock_irqsave(_blocks_reserved, flags);
diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
index 2a43746b803f..e11495fa3c18 100644
--- a/arch/arm64/mm/mteswap.c
+++ b/arch/arm64/mm/mteswap.c
@@ -20,6 +20,112 @@ void mte_free_tag_buf(void *buf)
kfree(buf);
 }
 
+#ifd

[PATCH RFC v3 27/35] arm64: mte: Handle tag storage pages mapped in an MTE VMA

2024-01-25 Thread Alexandru Elisei
Tag storage pages cannot be tagged. When such a page is mapped in an
MTE-enabled VMA, migrate it out directly and don't try to reserve tag
storage for it.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/include/asm/mte_tag_storage.h |  1 +
 arch/arm64/kernel/mte_tag_storage.c  | 15 +++
 arch/arm64/mm/fault.c| 11 +--
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
b/arch/arm64/include/asm/mte_tag_storage.h
index 6d0f6ffcfdd6..50bdae94cf71 100644
--- a/arch/arm64/include/asm/mte_tag_storage.h
+++ b/arch/arm64/include/asm/mte_tag_storage.h
@@ -32,6 +32,7 @@ int reserve_tag_storage(struct page *page, int order, gfp_t 
gfp);
 void free_tag_storage(struct page *page, int order);
 
 bool page_tag_storage_reserved(struct page *page);
+bool page_is_tag_storage(struct page *page);
 
 vm_fault_t handle_folio_missing_tag_storage(struct folio *folio, struct 
vm_fault *vmf,
bool *map_pte);
diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index 1c8469781870..afe2bb754879 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -492,6 +492,21 @@ bool page_tag_storage_reserved(struct page *page)
return test_bit(PG_tag_storage_reserved, &page->flags);
 }
 
+bool page_is_tag_storage(struct page *page)
+{
+   unsigned long pfn = page_to_pfn(page);
+   struct range *tag_range;
+   int i;
+
+   for (i = 0; i < num_tag_regions; i++) {
+   tag_range = _regions[i].tag_range;
+   if (tag_range->start <= pfn && pfn <= tag_range->end)
+   return true;
+   }
+
+   return false;
+}
+
 int reserve_tag_storage(struct page *page, int order, gfp_t gfp)
 {
unsigned long start_block, end_block;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 1db3adb6499f..01450ab91a87 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -1014,6 +1014,7 @@ static int replace_folio_with_tagged(struct folio *folio)
 vm_fault_t handle_folio_missing_tag_storage(struct folio *folio, struct 
vm_fault *vmf,
bool *map_pte)
 {
+   bool is_tag_storage = page_is_tag_storage(folio_page(folio, 0));
struct vm_area_struct *vma = vmf->vma;
int ret = 0;
 
@@ -1033,12 +1034,18 @@ vm_fault_t handle_folio_missing_tag_storage(struct 
folio *folio, struct vm_fault
if (unlikely(is_migrate_isolate_page(folio_page(folio, 0
goto out_retry;
 
-   ret = reserve_tag_storage(folio_page(folio, 0), folio_order(folio), 
GFP_HIGHUSER_MOVABLE);
-   if (ret) {
+   if (!is_tag_storage) {
+   ret = reserve_tag_storage(folio_page(folio, 0), 
folio_order(folio),
+ GFP_HIGHUSER_MOVABLE);
+   if (!ret)
+   goto out_map;
+
/* replace_folio_with_tagged() is expensive, try to avoid it. */
if (fault_flag_allow_retry_first(vmf->flags))
goto out_retry;
+   }
 
+   if (ret || is_tag_storage) {
replace_folio_with_tagged(folio);
return 0;
}
-- 
2.43.0




[PATCH RFC v3 26/35] arm64: mte: Use fault-on-access to reserve missing tag storage

2024-01-25 Thread Alexandru Elisei
There are three situations in which a page that is to be mapped as
tagged doesn't have the corresponding tag storage reserved:

* reserve_tag_storage() failed.

* The allocation didn't specify __GFP_TAGGED (this can happen during
  migration, for example).

* The page was mapped in a non-MTE enabled VMA, then an mprotect(PROT_MTE)
  enabled MTE.

If a page that is about to be mapped as tagged doesn't have tag storage
reserved, map it with the PAGE_FAULT_ON_ACCESS protection to trigger a
fault the next time it is accessed, and then reserve tag storage when the
fault is handled. If tag storage cannot be reserved, then the page is
migrated out of the VMA.

Tag storage pages (which cannot be tagged) mapped in an MTE enabled VMA
will be handled in a subsequent patch.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch, loosely based on the arm64 code from the rfc v2 patch  #19 ("mm:
mprotect: Introduce PAGE_FAULT_ON_ACCESS for mprotect(PROT_MTE)")
* All the common code has been moved back to the arch independent function
handle_{huge_pmd,pte}_protnone() (David Hildenbrand).
* Page is migrated if tag storage cannot be reserved after exhausting all
attempts (Hyesoo Yu).
* Moved folio_isolate_lru() declaration and struct migration_target_control to
headers in include/linux (Peter Collingbourne).

 arch/arm64/Kconfig   |  1 +
 arch/arm64/include/asm/mte.h |  4 +-
 arch/arm64/include/asm/mte_tag_storage.h |  3 +
 arch/arm64/include/asm/pgtable-prot.h|  2 +
 arch/arm64/include/asm/pgtable.h | 44 ---
 arch/arm64/kernel/mte.c  | 11 ++-
 arch/arm64/mm/fault.c| 98 
 include/linux/memcontrol.h   |  2 +
 include/linux/migrate.h  |  8 +-
 include/linux/migrate_mode.h |  1 +
 mm/internal.h|  6 --
 11 files changed, 156 insertions(+), 24 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6f65e9005dc9..088e30fc6d12 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2085,6 +2085,7 @@ config ARM64_MTE
 if ARM64_MTE
 config ARM64_MTE_TAG_STORAGE
bool
+   select ARCH_HAS_FAULT_ON_ACCESS
select CONFIG_CMA
help
  Adds support for dynamic management of the memory used by the hardware
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 6457b7899207..70dc2e409070 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -107,7 +107,7 @@ static inline bool try_page_mte_tagging(struct page *page)
 }
 
 void mte_zero_clear_page_tags(void *addr);
-void mte_sync_tags(pte_t pte, unsigned int nr_pages);
+void mte_sync_tags(pte_t *pteval, unsigned int nr_pages);
 void mte_copy_page_tags(void *kto, const void *kfrom);
 void mte_thread_init_user(void);
 void mte_thread_switch(struct task_struct *next);
@@ -139,7 +139,7 @@ static inline bool try_page_mte_tagging(struct page *page)
 static inline void mte_zero_clear_page_tags(void *addr)
 {
 }
-static inline void mte_sync_tags(pte_t pte, unsigned int nr_pages)
+static inline void mte_sync_tags(pte_t *pteval, unsigned int nr_pages)
 {
 }
 static inline void mte_copy_page_tags(void *kto, const void *kfrom)
diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
b/arch/arm64/include/asm/mte_tag_storage.h
index 423b19e0cc46..6d0f6ffcfdd6 100644
--- a/arch/arm64/include/asm/mte_tag_storage.h
+++ b/arch/arm64/include/asm/mte_tag_storage.h
@@ -32,6 +32,9 @@ int reserve_tag_storage(struct page *page, int order, gfp_t 
gfp);
 void free_tag_storage(struct page *page, int order);
 
 bool page_tag_storage_reserved(struct page *page);
+
+vm_fault_t handle_folio_missing_tag_storage(struct folio *folio, struct 
vm_fault *vmf,
+   bool *map_pte);
 #else
 static inline bool tag_storage_enabled(void)
 {
diff --git a/arch/arm64/include/asm/pgtable-prot.h 
b/arch/arm64/include/asm/pgtable-prot.h
index 483dbfa39c4c..1820e29244f8 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -19,6 +19,7 @@
 #define PTE_SPECIAL(_AT(pteval_t, 1) << 56)
 #define PTE_DEVMAP (_AT(pteval_t, 1) << 57)
 #define PTE_PROT_NONE  (_AT(pteval_t, 1) << 58) /* only when 
!PTE_VALID */
+#define PTE_TAG_STORAGE_NONE   (_AT(pteval_t, 1) << 60) /* only when 
PTE_PROT_NONE */
 
 /*
  * This bit indicates that the entry is present i.e. pmd_page()
@@ -96,6 +97,7 @@ extern bool arm64_use_ng_mappings;
 })
 
 #define PAGE_NONE  __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | 
PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
+#define PAGE_FAULT_ON_ACCESS   __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | 
PTE_PROT_NONE | PTE_TAG_STORAGE_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
 /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE *

[PATCH RFC v3 25/35] arm64: mte: Reserve tag block for the zero page

2024-01-25 Thread Alexandru Elisei
On arm64, when a page is mapped as tagged, its tags are zeroed for two
reasons:

* To prevent leakage of tags to userspace.

* To allow userspace to access the contents of the page without having to set
  the tags explicitly (bits 59:56 of a userspace pointer are zero, which
  corresponds to tag 0b0000).

The zero page receives special treatment, as the tags for the zero page are
zeroed when the MTE feature is being enabled. This is done for performance
reasons - the tags are zeroed once, instead of every time the page is
mapped.

When the tags for the zero page are zeroed, tag storage is not yet enabled.
Reserve tag storage for the page immediately after tag storage management
becomes enabled.

Note that zeroing tags before tag storage management is enabled is safe to
do because the tag storage pages are reserved at that point.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* Expanded commit message (David Hildenbrand)

 arch/arm64/kernel/mte_tag_storage.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index 8c347f4855e4..1c8469781870 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -363,6 +363,8 @@ static int __init mte_enable_tag_storage(void)
goto out_disabled;
}
 
+   reserve_tag_storage(ZERO_PAGE(0), 0, GFP_HIGHUSER);
+
static_branch_enable(&tag_storage_enabled_key);
pr_info("MTE tag storage region management enabled");
 
-- 
2.43.0




[PATCH RFC v3 24/35] arm64: mte: Perform CMOs for tag blocks

2024-01-25 Thread Alexandru Elisei
Make sure the contents of the tag storage block are not corrupted by
performing:

1. A tag dcache inval when the associated tagged pages are freed, to avoid
   dirty tag cache lines being evicted and corrupting the tag storage
   block when it's being used to store data.

2. A data cache inval when the tag storage block is being reserved, to
   ensure that no dirty data cache lines are present, which would
   trigger a writeback that could corrupt the tags stored in the block.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/include/asm/assembler.h   | 10 ++
 arch/arm64/include/asm/mte_tag_storage.h |  2 ++
 arch/arm64/kernel/mte_tag_storage.c  | 11 +++
 arch/arm64/lib/mte.S | 16 
 4 files changed, 39 insertions(+)

diff --git a/arch/arm64/include/asm/assembler.h 
b/arch/arm64/include/asm/assembler.h
index 513787e43329..65fe88cce72b 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -310,6 +310,16 @@ alternative_cb_end
lsl \reg, \reg, \tmp// actual cache line size
.endm
 
+/*
+ * tcache_line_size - get the safe tag cache line size across all CPUs
+ */
+   .macro  tcache_line_size, reg, tmp
+   read_ctr\tmp
+   ubfm\tmp, \tmp, #32, #37// tag cache line size encoding
+   mov \reg, #4// bytes per word
+   lsl \reg, \reg, \tmp// actual tag cache line size
+   .endm
+
 /*
  * raw_icache_line_size - get the minimum I-cache line size on this CPU
  * from the CTR register.
diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
b/arch/arm64/include/asm/mte_tag_storage.h
index 09f1318d924e..423b19e0cc46 100644
--- a/arch/arm64/include/asm/mte_tag_storage.h
+++ b/arch/arm64/include/asm/mte_tag_storage.h
@@ -11,6 +11,8 @@
 
 #include 
 
+extern void dcache_inval_tags_poc(unsigned long start, unsigned long end);
+
 #ifdef CONFIG_ARM64_MTE_TAG_STORAGE
 
 DECLARE_STATIC_KEY_FALSE(tag_storage_enabled_key);
diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index 762c7c803a70..8c347f4855e4 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 
+#include 
 #include 
 
 __ro_after_init DEFINE_STATIC_KEY_FALSE(tag_storage_enabled_key);
@@ -421,8 +422,13 @@ static bool tag_storage_block_is_reserved(unsigned long 
block)
 
 static int tag_storage_reserve_block(unsigned long block, struct tag_region 
*region, int order)
 {
+   unsigned long block_va;
int ret;
 
+   block_va = (unsigned long)page_to_virt(pfn_to_page(block));
+   /* Avoid writeback of dirty data cache lines corrupting tags. */
+   dcache_inval_poc(block_va, block_va + region->block_size_pages * 
PAGE_SIZE);
+
ret = xa_err(xa_store(_blocks_reserved, block, pfn_to_page(block), 
GFP_KERNEL));
if (!ret)
block_ref_add(block, region, order);
@@ -570,6 +576,7 @@ void free_tag_storage(struct page *page, int order)
 {
unsigned long block, start_block, end_block;
struct tag_region *region;
+   unsigned long page_va;
unsigned long flags;
int ret;
 
@@ -577,6 +584,10 @@ void free_tag_storage(struct page *page, int order)
if (WARN_ONCE(ret, "Missing tag storage block for pfn 0x%lx", 
page_to_pfn(page)))
return;
 
+   page_va = (unsigned long)page_to_virt(page);
+   /* Avoid writeback of dirty tag cache lines corrupting data. */
+   dcache_inval_tags_poc(page_va, page_va + (PAGE_SIZE << order));
+
end_block = start_block + order_to_num_blocks(order, 
region->block_size_pages);
 
xa_lock_irqsave(_blocks_reserved, flags);
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
index 9f623e9da09f..bc02b4e95062 100644
--- a/arch/arm64/lib/mte.S
+++ b/arch/arm64/lib/mte.S
@@ -175,3 +175,19 @@ SYM_FUNC_START(mte_copy_page_tags_from_buf)
 
ret
 SYM_FUNC_END(mte_copy_page_tags_from_buf)
+
+/*
+ * dcache_inval_tags_poc(start, end)
+ *
+ * Ensure that any tags in the D-cache for the interval [start, end)
+ * are invalidated to PoC.
+ *
+ * - start   - virtual start address of region
+ * - end - virtual end address of region
+ */
+SYM_FUNC_START(__pi_dcache_inval_tags_poc)
+   tcache_line_size x2, x3
+   dcache_by_myline_op igvac, sy, x0, x1, x2, x3
+   ret
+SYM_FUNC_END(__pi_dcache_inval_tags_poc)
+SYM_FUNC_ALIAS(dcache_inval_tags_poc, __pi_dcache_inval_tags_poc)
-- 
2.43.0




[PATCH RFC v3 23/35] arm64: mte: Try to reserve tag storage in arch_alloc_page()

2024-01-25 Thread Alexandru Elisei
Reserve tag storage for a page that is being allocated as tagged. This
is a best effort approach, and failing to reserve tag storage is
allowed.

When all the associated tagged pages have been freed, return the tag
storage pages back to the page allocator, where they can be used again for
data allocations.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* Based on rfc v2 patch #16 ("arm64: mte: Manage tag storage on page
allocation").
* Fixed calculation of the number of associated tag storage blocks (Hyesoo
Yu).
* Tag storage is reserved in arch_alloc_page() instead of
arch_prep_new_page().

 arch/arm64/include/asm/mte.h |  16 +-
 arch/arm64/include/asm/mte_tag_storage.h |  31 +++
 arch/arm64/include/asm/page.h|   5 +
 arch/arm64/include/asm/pgtable.h |  19 ++
 arch/arm64/kernel/mte_tag_storage.c  | 234 +++
 arch/arm64/mm/fault.c|   7 +
 fs/proc/page.c   |   1 +
 include/linux/kernel-page-flags.h|   1 +
 include/linux/page-flags.h   |   1 +
 include/trace/events/mmflags.h   |   3 +-
 mm/huge_memory.c |   1 +
 11 files changed, 316 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 8034695b3dd7..6457b7899207 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -40,12 +40,24 @@ void mte_free_tag_buf(void *buf);
 #ifdef CONFIG_ARM64_MTE
 
 /* track which pages have valid allocation tags */
-#define PG_mte_tagged  PG_arch_2
+#define PG_mte_tagged  PG_arch_2
 /* simple lock to avoid multiple threads tagging the same page */
-#define PG_mte_lockPG_arch_3
+#define PG_mte_lockPG_arch_3
+/* Track if a tagged page has tag storage reserved */
+#define PG_tag_storage_reservedPG_arch_4
+
+#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
+DECLARE_STATIC_KEY_FALSE(tag_storage_enabled_key);
+extern bool page_tag_storage_reserved(struct page *page);
+#endif
 
 static inline void set_page_mte_tagged(struct page *page)
 {
+#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
+   /* Open code mte_tag_storage_enabled() */
+   WARN_ON_ONCE(static_branch_likely(&tag_storage_enabled_key) &&
+!page_tag_storage_reserved(page));
+#endif
/*
 * Ensure that the tags written prior to this function are visible
 * before the page flags update.
diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
b/arch/arm64/include/asm/mte_tag_storage.h
index 7b3f6bff8e6f..09f1318d924e 100644
--- a/arch/arm64/include/asm/mte_tag_storage.h
+++ b/arch/arm64/include/asm/mte_tag_storage.h
@@ -5,6 +5,12 @@
 #ifndef __ASM_MTE_TAG_STORAGE_H
 #define __ASM_MTE_TAG_STORAGE_H
 
+#ifndef __ASSEMBLY__
+
+#include 
+
+#include 
+
 #ifdef CONFIG_ARM64_MTE_TAG_STORAGE
 
 DECLARE_STATIC_KEY_FALSE(tag_storage_enabled_key);
@@ -15,6 +21,15 @@ static inline bool tag_storage_enabled(void)
 }
 
 void mte_init_tag_storage(void);
+
+static inline bool alloc_requires_tag_storage(gfp_t gfp)
+{
+   return gfp & __GFP_TAGGED;
+}
+int reserve_tag_storage(struct page *page, int order, gfp_t gfp);
+void free_tag_storage(struct page *page, int order);
+
+bool page_tag_storage_reserved(struct page *page);
 #else
 static inline bool tag_storage_enabled(void)
 {
@@ -23,6 +38,22 @@ static inline bool tag_storage_enabled(void)
 static inline void mte_init_tag_storage(void)
 {
 }
+static inline bool alloc_requires_tag_storage(struct page *page)
+{
+   return false;
+}
+static inline int reserve_tag_storage(struct page *page, int order, gfp_t gfp)
+{
+   return 0;
+}
+static inline void free_tag_storage(struct page *page, int order)
+{
+}
+static inline bool page_tag_storage_reserved(struct page *page)
+{
+   return true;
+}
 #endif /* CONFIG_ARM64_MTE_TAG_STORAGE */
 
+#endif /* !__ASSEMBLY__ */
 #endif /* __ASM_MTE_TAG_STORAGE_H  */
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 88bab032a493..3a656492f34a 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -35,6 +35,11 @@ void copy_highpage(struct page *to, struct page *from);
 void tag_clear_highpage(struct page *to);
 #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
 
+#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
+void arch_alloc_page(struct page *, int order, gfp_t gfp);
+#define HAVE_ARCH_ALLOC_PAGE
+#endif
+
 #define clear_user_page(page, vaddr, pg)   clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)copy_page(to, from)
 
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 2499cc4fa4f2..f30466199a9b 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -10,6 +10,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1069,6 +1070,24 @@ static inline void arch_swap_restore(swp_entry_t entry, 
struct folio *folio)
mte_restore_page_t

[PATCH RFC v3 22/35] arm64: mte: Enable tag storage if CMA areas have been activated

2024-01-25 Thread Alexandru Elisei
Before enabling MTE tag storage management, make sure that the CMA areas
have been successfully activated. If a CMA area fails activation, the pages
are kept as reserved. Reserved pages are never used by the page allocator.

If this happens, the kernel would have to manage tag storage only for some
of the memory, but not for all memory, and that would make the code
unreasonably complicated.

Choose to disable tag storage management altogether if a CMA area fails to
be activated.

Signed-off-by: Alexandru Elisei 
---

Changes since v2:

* New patch.

 arch/arm64/include/asm/mte_tag_storage.h | 12 ++
 arch/arm64/kernel/mte_tag_storage.c  | 50 
 2 files changed, 62 insertions(+)

diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
b/arch/arm64/include/asm/mte_tag_storage.h
index 3c2cd29e053e..7b3f6bff8e6f 100644
--- a/arch/arm64/include/asm/mte_tag_storage.h
+++ b/arch/arm64/include/asm/mte_tag_storage.h
@@ -6,8 +6,20 @@
 #define __ASM_MTE_TAG_STORAGE_H
 
 #ifdef CONFIG_ARM64_MTE_TAG_STORAGE
+
+DECLARE_STATIC_KEY_FALSE(tag_storage_enabled_key);
+
+static inline bool tag_storage_enabled(void)
+{
+   return static_branch_likely(&tag_storage_enabled_key);
+}
+
 void mte_init_tag_storage(void);
 #else
+static inline bool tag_storage_enabled(void)
+{
+   return false;
+}
 static inline void mte_init_tag_storage(void)
 {
 }
diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index 9a1a8a45171e..d58c68b4a849 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -19,6 +19,8 @@
 
 #include 
 
+__ro_after_init DEFINE_STATIC_KEY_FALSE(tag_storage_enabled_key);
+
 struct tag_region {
struct range mem_range; /* Memory associated with the tag storage, in 
PFNs. */
struct range tag_range; /* Tag storage memory, in PFNs. */
@@ -314,3 +316,51 @@ void __init mte_init_tag_storage(void)
num_tag_regions = 0;
pr_info("MTE tag storage region management disabled");
 }
+
+static int __init mte_enable_tag_storage(void)
+{
+   struct range *tag_range;
+   struct cma *cma;
+   int i, ret;
+
+   if (num_tag_regions == 0)
+   return 0;
+
+   for (i = 0; i < num_tag_regions; i++) {
+   tag_range = &tag_regions[i].tag_range;
+   cma = tag_regions[i].cma;
+   /*
+* CMA will keep the pages as reserved when the region fails
+* activation.
+*/
+   if (PageReserved(pfn_to_page(tag_range->start)))
+   goto out_disabled;
+   }
+
+   static_branch_enable(&tag_storage_enabled_key);
+   pr_info("MTE tag storage region management enabled");
+
+   return 0;
+
+out_disabled:
+   for (i = 0; i < num_tag_regions; i++) {
+   tag_range = &tag_regions[i].tag_range;
+   cma = tag_regions[i].cma;
+
+   if (PageReserved(pfn_to_page(tag_range->start)))
+   continue;
+
+   /* Try really hard to reserve the tag storage. */
+   ret = cma_alloc(cma, range_len(tag_range), 8, true);
+   /*
+* Tag storage is still in use for data, memory and/or tag
+* corruption will ensue.
+*/
+   WARN_ON_ONCE(ret);
+   }
+   num_tag_regions = 0;
+   pr_info("MTE tag storage region management disabled");
+
+   return -EINVAL;
+}
+arch_initcall(mte_enable_tag_storage);
-- 
2.43.0




[PATCH RFC v3 21/35] arm64: mte: Disable dynamic tag storage management if HW KASAN is enabled

2024-01-25 Thread Alexandru Elisei
Reserving the tag storage associated with a tagged page requires that the
tag storage can be migrated if it is in use for data.

The kernel allocates pages in non-preemptible contexts, which makes
migration impossible. The only user of tagged pages in the kernel is HW
KASAN, so don't use tag storage pages if HW KASAN is enabled.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* Expanded commit message (David Hildenbrand)

 arch/arm64/kernel/mte_tag_storage.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index 90b157132efa..9a1a8a45171e 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -256,6 +256,16 @@ void __init mte_init_tag_storage(void)
goto out_disabled;
}
 
+   /*
+* The kernel allocates memory in non-preemptible contexts, which makes
+* migration impossible when reserving the associated tag storage. The
+* only in-kernel user of tagged pages is HW KASAN.
+*/
+   if (kasan_hw_tags_enabled()) {
+   pr_info("KASAN HW tags incompatible with MTE tag storage 
management");
+   goto out_disabled;
+   }
+
/*
 * Check that tag storage is addressable by the kernel.
 * cma_init_reserved_mem(), unlike cma_declare_contiguous_nid(), doesn't
-- 
2.43.0




[PATCH RFC v3 20/35] arm64: mte: Add tag storage memory to CMA

2024-01-25 Thread Alexandru Elisei
Add the MTE tag storage pages to CMA, which allows the page allocator to
manage them like regular pages.

The CMA migratetype lends the tag storage pages some very desirable
properties:

* They cannot be longterm pinned, meaning they should always be migratable.

* The pages can be allocated explicitly by using their PFN (with
  alloc_cma_range()) when they are needed to store tags.

Signed-off-by: Alexandru Elisei 
---

Changes since v2:

* Reworked from rfc v2 patch #12 ("arm64: mte: Add tag storage pages to the
MIGRATE_CMA migratetype").
* Tag storage memory is now added to the cma_areas array and will be managed
like a regular CMA region (David Hildenbrand).
* If a tag storage region spans multiple zones, CMA won't be able to activate
the region. Split such regions into multiple tag storage regions (Hyesoo Yu).

 arch/arm64/Kconfig  |   1 +
 arch/arm64/kernel/mte_tag_storage.c | 150 +++-
 2 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 92d97930b56e..6f65e9005dc9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2085,6 +2085,7 @@ config ARM64_MTE
 if ARM64_MTE
 config ARM64_MTE_TAG_STORAGE
bool
+   select CONFIG_CMA
help
  Adds support for dynamic management of the memory used by the hardware
  for storing MTE tags. This memory, unlike normal memory, cannot be
diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index 2f32265d8ad8..90b157132efa 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -5,6 +5,8 @@
  * Copyright (C) 2023 ARM Ltd.
  */
 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -22,6 +24,7 @@ struct tag_region {
struct range tag_range; /* Tag storage memory, in PFNs. */
u32 block_size_pages;   /* Tag block size, in pages. */
phandle mem_phandle;/* phandle for the associated memory node. */
+   struct cma *cma;/* CMA cookie */
 };
 
 #define MAX_TAG_REGIONS32
@@ -139,9 +142,88 @@ static int __init mte_find_tagged_memory_regions(void)
return -EINVAL;
 }
 
+static void __init mte_split_tag_region(struct tag_region *region, unsigned 
long last_tag_pfn)
+{
+   struct tag_region *new_region;
+   unsigned long last_mem_pfn;
+
+   new_region = &tag_regions[num_tag_regions];
+   last_mem_pfn = region->mem_range.start + (last_tag_pfn - 
region->tag_range.start) * 32;
+
+   new_region->mem_range.start = last_mem_pfn + 1;
+   new_region->mem_range.end = region->mem_range.end;
+   region->mem_range.end = last_mem_pfn;
+
+   new_region->tag_range.start = last_tag_pfn + 1;
+   new_region->tag_range.end = region->tag_range.end;
+   region->tag_range.end = last_tag_pfn;
+
+   new_region->block_size_pages = region->block_size_pages;
+
+   num_tag_regions++;
+}
+
+/*
+ * Split any tag region that spans multiple zones - CMA will fail if that
+ * happens.
+ */
+static int __init mte_split_tag_regions(void)
+{
+   struct tag_region *region;
+   struct range *tag_range;
+   struct zone *zone;
+   unsigned long pfn;
+   int i;
+
+   for (i = 0; i < num_tag_regions; i++) {
+   region = &tag_regions[i];
+   tag_range = &region->tag_range;
+   zone = page_zone(pfn_to_page(tag_range->start));
+
+   for (pfn = tag_range->start + 1; pfn <= tag_range->end; pfn++) {
+   if (page_zone(pfn_to_page(pfn)) == zone)
+   continue;
+
+   if (WARN_ON_ONCE(pfn % region->block_size_pages))
+   goto out_err;
+
+   if (num_tag_regions == MAX_TAG_REGIONS)
+   goto out_err;
+
+   mte_split_tag_region(&tag_regions[i], pfn - 1);
+   /* Move on to the next region. */
+   break;
+   }
+   }
+
+   return 0;
+
+out_err:
+   pr_err("Error splitting tag storage region 0x%llx-0x%llx spanning 
multiple zones",
+   PFN_PHYS(tag_range->start), PFN_PHYS(tag_range->end + 1) - 1);
+   return -EINVAL;
+}
+
 void __init mte_init_tag_storage(void)
 {
-   int ret;
+   unsigned long long mem_end;
+   struct tag_region *region;
+   unsigned long pfn, order;
+   u64 start, end;
+   int i, j, ret;
+
+   /*
+* Tag storage memory requires that tag storage pages in use for data
+* are always migratable when they need to be repurposed to store tags.
+* If ARCH_KEEP_MEMBLOCK is enabled, kexec will not scan reserved
+* memblocks when trying to find a suitable location for the kernel
+* image. This means that kexec will not use tag storage pages for
+* copying the kernel, and the pa

[PATCH RFC v3 19/35] arm64: mte: Discover tag storage memory

2024-01-25 Thread Alexandru Elisei
Allow the kernel to get the base address, size, block size and associated
memory node for tag storage from the device tree blob.

A tag storage region represents the smallest contiguous memory region that
holds all the tags for the associated contiguous memory region which can be
tagged. For example, for 32GB of contiguous tagged memory the corresponding
tag storage region is exactly 1GB of contiguous memory, not two adjacent
512MB tag storage regions, nor one 2GB tag storage region.

Tag storage is described as reserved memory; future patches will teach the
kernel how to make use of it for data (non-tagged) allocations.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* Reworked from rfc v2 patch #11 ("arm64: mte: Reserve tag storage memory").
* Added device tree schema (Rob Herring)
* Tag storage memory is now described in the "reserved-memory" node (Rob
Herring).

 .../reserved-memory/arm,mte-tag-storage.yaml  |  78 +
 arch/arm64/Kconfig|  12 ++
 arch/arm64/include/asm/mte_tag_storage.h  |  16 ++
 arch/arm64/kernel/Makefile|   1 +
 arch/arm64/kernel/mte_tag_storage.c   | 158 ++
 arch/arm64/mm/init.c  |   3 +
 6 files changed, 268 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/reserved-memory/arm,mte-tag-storage.yaml
 create mode 100644 arch/arm64/include/asm/mte_tag_storage.h
 create mode 100644 arch/arm64/kernel/mte_tag_storage.c

diff --git 
a/Documentation/devicetree/bindings/reserved-memory/arm,mte-tag-storage.yaml 
b/Documentation/devicetree/bindings/reserved-memory/arm,mte-tag-storage.yaml
new file mode 100644
index 000000000000..a99aaa1e8b6e
--- /dev/null
+++ b/Documentation/devicetree/bindings/reserved-memory/arm,mte-tag-storage.yaml
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/reserved-memory/arm,mte-tag-storage.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Tag storage memory for Memory Tagging Extension
+
+description: |
+  Description of the tag storage memory region that Linux can use to store
+  data when the associated memory is not tagged.
+
+  The reserved memory described by the node must also be described by a
+  standalone 'memory' node.
+
+maintainers:
+  - Alexandru Elisei 
+
+allOf:
+  - $ref: reserved-memory.yaml
+
+properties:
+  compatible:
+const: arm,mte-tag-storage
+
+  reg:
+description: |
+  Specifies the memory region that MTE uses for tag storage. The size of 
the
+  region must be equal to the size needed to store all the tags for the
+  associated tagged memory.
+
+  block-size:
+description: |
+  Specifies the minimum multiple of 4K bytes of tag storage where all the
+  tags stored in the block correspond to a contiguous memory region. This
+  is needed for platforms where the memory controller interleaves tag
+  writes to memory.
+
+  For example, if the memory controller interleaves tag writes for 256KB
+  of contiguous memory across 8K of tag storage (2-way interleave), then
+  the correct value for 'block-size' is 0x2000.
+
+  This value is a hardware property, independent of the selected kernel 
page
+  size.
+$ref: /schemas/types.yaml#/definitions/uint32
+
+  tagged-memory:
+description: |
+  Specifies the memory node, as a phandle, for which all the tags are
+  stored in the tag storage region.
+
+  The memory node must describe one contiguous memory region (i.e, the
+  'ranges' property of the memory node must have exactly one entry).
+$ref: /schemas/types.yaml#/definitions/phandle
+
+unevaluatedProperties: false
+
+required:
+  - compatible
+  - reg
+  - block-size
+  - tagged-memory
+  - reusable
+
+examples:
+  - |
+reserved-memory {
+  #address-cells = <2>;
+  #size-cells = <2>;
+
+  tags0: tag-storage@8f800 {
+compatible = "arm,mte-tag-storage";
+reg = <0x08 0xf800 0x00 0x400>;
+block-size = <0x1000>;
+tagged-memory = <>;
+reusable;
+  };
+};
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index aa7c1d435139..92d97930b56e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2082,6 +2082,18 @@ config ARM64_MTE
 
  Documentation/arch/arm64/memory-tagging-extension.rst.
 
+if ARM64_MTE
+config ARM64_MTE_TAG_STORAGE
+   bool
+   help
+ Adds support for dynamic management of the memory used by the hardware
+ for storing MTE tags. This memory, unlike normal memory, cannot be
+ tagged. When it is used to store tags for another memory location it
+ cannot be used for any type of allocation.
+
+ If unsure, say N
+endif # ARM64_MTE
+
 endmenu # "ARMv8.5 architectural features"
 
 menu "ARMv8.7 architectural features&q

[PATCH RFC v3 18/35] arm64: mte: Rename __GFP_ZEROTAGS to __GFP_TAGGED

2024-01-25 Thread Alexandru Elisei
__GFP_ZEROTAGS is used to instruct the page allocator to zero the tags at
the same time as the physical frame is zeroed. The name can be slightly
misleading, because it doesn't mean that the code will zero the tags
unconditionally, but that the tags will be zeroed if and only if the
physical frame is also zeroed (either __GFP_ZERO is set or init_on_alloc is
1).

Rename it to __GFP_TAGGED, in preparation for it to be used by the page
allocator to recognize when an allocation is tagged (has metadata).

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/mm/fault.c  | 2 +-
 include/linux/gfp_types.h  | 6 +++---
 include/trace/events/mmflags.h | 2 +-
 mm/page_alloc.c| 2 +-
 mm/shmem.c | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 4d3f0a870ad8..c022e473c17c 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -944,7 +944,7 @@ NOKPROBE_SYMBOL(do_debug_exception);
 gfp_t arch_calc_vma_gfp(struct vm_area_struct *vma, gfp_t gfp)
 {
if (vma->vm_flags & VM_MTE)
-   return __GFP_ZEROTAGS;
+   return __GFP_TAGGED;
return 0;
 }
 
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 1b6053da8754..f638353ebdc7 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -45,7 +45,7 @@ typedef unsigned int __bitwise gfp_t;
 #define ___GFP_HARDWALL0x10u
 #define ___GFP_THISNODE0x20u
 #define ___GFP_ACCOUNT 0x40u
-#define ___GFP_ZEROTAGS0x80u
+#define ___GFP_TAGGED  0x80u
 #ifdef CONFIG_KASAN_HW_TAGS
 #define ___GFP_SKIP_ZERO   0x100u
 #define ___GFP_SKIP_KASAN  0x200u
@@ -226,7 +226,7 @@ typedef unsigned int __bitwise gfp_t;
  *
  * %__GFP_ZERO returns a zeroed page on success.
  *
- * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself
+ * %__GFP_TAGGED zeroes memory tags at allocation time if the memory itself
  * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that
  * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting
  * memory tags at the same time as zeroing memory has minimal additional
@@ -241,7 +241,7 @@ typedef unsigned int __bitwise gfp_t;
 #define __GFP_NOWARN   ((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP ((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO)
-#define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS)
+#define __GFP_TAGGED   ((__force gfp_t)___GFP_TAGGED)
 #define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
 #define __GFP_SKIP_KASAN ((__force gfp_t)___GFP_SKIP_KASAN)
 
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index d801409b33cf..6ca0d5ed46c0 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -50,7 +50,7 @@
gfpflag_string(__GFP_RECLAIM),  \
gfpflag_string(__GFP_DIRECT_RECLAIM),   \
gfpflag_string(__GFP_KSWAPD_RECLAIM),   \
-   gfpflag_string(__GFP_ZEROTAGS)
+   gfpflag_string(__GFP_TAGGED)
 
 #ifdef CONFIG_KASAN_HW_TAGS
 #define __def_gfpflag_names_kasan ,\
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 502ee3eb8583..0a0118612a13 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1480,7 +1480,7 @@ inline void post_alloc_hook(struct page *page, unsigned 
int order,
 {
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
!should_skip_init(gfp_flags);
-   bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+   bool zero_tags = init && (gfp_flags & __GFP_TAGGED);
int i;
 
set_page_private(page, 0);
diff --git a/mm/shmem.c b/mm/shmem.c
index 621fabc3b8c6..3e28357b0a40 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1585,7 +1585,7 @@ static struct folio *shmem_swapin_cluster(swp_entry_t 
swap, gfp_t gfp,
  */
 static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
 {
-   gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM | __GFP_ZEROTAGS;
+   gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM | __GFP_TAGGED;
gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
-- 
2.43.0




[PATCH RFC v3 17/35] arm64: mte: Rework naming for tag manipulation functions

2024-01-25 Thread Alexandru Elisei
The tag save/restore/copy functions could be more explicit about where the
tags are coming from and where they are being copied to. Rename the
functions to make it easier to understand what they are doing:

- Rename the mte_clear_page_tags() 'addr' parameter to 'page_addr', to
  match the other functions that take a page address as parameter.

- Rename mte_save/restore_tags() to
  mte_save/restore_page_tags_by_swp_entry() to make it clear that they are
  saved in a collection indexed by swp_entry (this will become important
  when they will be also saved in a collection indexed by page pfn). Same
  applies to mte_invalidate_tags{,_area}_by_swp_entry().

- Rename mte_save/restore_page_tags() to make it clear where the tags are
  going to be saved, respectively from where they are restored - in a
  previously allocated memory buffer, not in an xarray, like when the tags
  are saved when swapping. Rename the action to 'copy' instead of
  'save'/'restore' to match the copy from user functions, which also copy
  tags to memory.

- Rename mte_allocate/free_tag_storage() to mte_allocate/free_tag_buf() to
  make it clear the functions have nothing to do with the memory where the
  corresponding tags for a page live. Change the parameter type for
  mte_free_tag_buf() to be void *, to match the return value of
  mte_allocate_tag_buf(). Also do that because that memory is opaque and it
  is not meant to be directly dereferenced.

In the name of consistency rename local variables from tag_storage to tags.
Give a similar treatment to the hibernation code that saves and restores
the tags for all tagged pages.

In the same spirit, rename MTE_PAGE_TAG_STORAGE to
MTE_PAGE_TAG_STORAGE_SIZE to make it clear that it relates to the size of
the memory needed to save the tags for a page. Opportunistically rename
MTE_TAG_SIZE to MTE_TAG_SIZE_BITS to make it clear it is measured in bits,
not bytes, like the rest of the size variables from the same header file.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/include/asm/mte-def.h | 16 +-
 arch/arm64/include/asm/mte.h | 23 +--
 arch/arm64/include/asm/pgtable.h |  8 ++---
 arch/arm64/kernel/elfcore.c  | 14 -
 arch/arm64/kernel/hibernate.c| 46 ++---
 arch/arm64/lib/mte.S | 18 ++--
 arch/arm64/mm/mteswap.c  | 50 
 7 files changed, 90 insertions(+), 85 deletions(-)

diff --git a/arch/arm64/include/asm/mte-def.h b/arch/arm64/include/asm/mte-def.h
index 14ee86b019c2..eb0d76a6bdcf 100644
--- a/arch/arm64/include/asm/mte-def.h
+++ b/arch/arm64/include/asm/mte-def.h
@@ -5,14 +5,14 @@
 #ifndef __ASM_MTE_DEF_H
 #define __ASM_MTE_DEF_H
 
-#define MTE_GRANULE_SIZE   UL(16)
-#define MTE_GRANULE_MASK   (~(MTE_GRANULE_SIZE - 1))
-#define MTE_GRANULES_PER_PAGE  (PAGE_SIZE / MTE_GRANULE_SIZE)
-#define MTE_TAG_SHIFT  56
-#define MTE_TAG_SIZE   4
-#define MTE_TAG_MASK   GENMASK((MTE_TAG_SHIFT + (MTE_TAG_SIZE - 1)), 
MTE_TAG_SHIFT)
-#define MTE_PAGE_TAG_STORAGE   (MTE_GRANULES_PER_PAGE * MTE_TAG_SIZE / 8)
+#define MTE_GRANULE_SIZE   UL(16)
+#define MTE_GRANULE_MASK   (~(MTE_GRANULE_SIZE - 1))
+#define MTE_GRANULES_PER_PAGE  (PAGE_SIZE / MTE_GRANULE_SIZE)
+#define MTE_TAG_SHIFT  56
+#define MTE_TAG_SIZE_BITS  4
+#define MTE_TAG_MASK   GENMASK((MTE_TAG_SHIFT + (MTE_TAG_SIZE_BITS - 
1)), MTE_TAG_SHIFT)
+#define MTE_PAGE_TAG_STORAGE_SIZE  (MTE_GRANULES_PER_PAGE * 
MTE_TAG_SIZE_BITS / 8)
 
-#define __MTE_PREAMBLE ARM64_ASM_PREAMBLE ".arch_extension memtag\n"
+#define __MTE_PREAMBLE ARM64_ASM_PREAMBLE ".arch_extension 
memtag\n"
 
 #endif /* __ASM_MTE_DEF_H  */
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 91fbd5c8a391..8034695b3dd7 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -18,19 +18,24 @@
 
 #include 
 
-void mte_clear_page_tags(void *addr);
+void mte_clear_page_tags(void *page_addr);
+
 unsigned long mte_copy_tags_from_user(void *to, const void __user *from,
  unsigned long n);
 unsigned long mte_copy_tags_to_user(void __user *to, void *from,
unsigned long n);
-int mte_save_tags(struct page *page);
-void mte_save_page_tags(const void *page_addr, void *tag_storage);
-void mte_restore_tags(swp_entry_t entry, struct page *page);
-void mte_restore_page_tags(void *page_addr, const void *tag_storage);
-void mte_invalidate_tags(int type, pgoff_t offset);
-void mte_invalidate_tags_area(int type);
-void *mte_allocate_tag_storage(void);
-void mte_free_tag_storage(char *storage);
+
+int mte_save_page_tags_by_swp_entry(struct page *page);
+void mte_restore_page_tags_by_swp_entry(swp_entry_t entry, struct page *page);
+
+void mte_copy_page_tags_to_buf(const void *page_addr, void *to);
+void mte_copy_pag

[PATCH RFC v3 16/35] KVM: arm64: Don't deny VM_PFNMAP VMAs when kvm_has_mte()

2024-01-25 Thread Alexandru Elisei
According to ARM DDI 0487J.a, page D10-5976, a memory location which
doesn't have the Normal memory attribute is considered Untagged, and
accesses are Tag Unchecked. Tag reads from an Untagged address return
0b0000, and writes are ignored.

Linux uses VM_PFNMAP VMAs to represent device memory, and Linux doesn't set
the VM_MTE_ALLOWED flag for these VMAs.

In user_mem_abort(), KVM requires that all VMAs that back guest memory must
allow tagging (VM_MTE_ALLOWED flag set), except for VMAs that represent
device memory.  When a memslot is created or changed, KVM enforces a
different behaviour: **all** VMAs that intersect the memslot must allow
tagging, even those that represent device memory. This is too restrictive,
and can lead to inconsistent behaviour: a VM_PFNMAP VMA that is present
when a memslot is created causes KVM_SET_USER_MEMORY_REGION to fail, but if
such a VMA is created after the memslot has been created, the virtual
machine will run without errors.

Change kvm_arch_prepare_memory_region() to allow VM_PFNMAP VMAs when the VM
has the MTE capability enabled.

Signed-off-by: Alexandru Elisei 
---

Changes from rfc v2:

* New patch. It's a fix, and can be taken independently of the series.

 arch/arm64/kvm/mmu.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d14504821b79..b7517c4a19c4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -2028,17 +2028,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (!vma)
break;
 
-   if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
-   ret = -EINVAL;
-   break;
-   }
-
if (vma->vm_flags & VM_PFNMAP) {
/* IO region dirty page logging not allowed */
if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
ret = -EINVAL;
break;
}
+   } else if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
+   ret = -EINVAL;
+   break;
}
hva = min(reg_end, vma->vm_end);
} while (hva < reg_end);
-- 
2.43.0




[PATCH RFC v3 15/35] of: fdt: Add of_flat_read_u32()

2024-01-25 Thread Alexandru Elisei
Add the function of_flat_read_u32() to return the value of a property as
a u32.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch, suggested by Rob Herring.

 drivers/of/fdt.c   | 21 +
 include/linux/of_fdt.h |  2 ++
 2 files changed, 23 insertions(+)

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index bf502ba8da95..dfcd79fd5fd9 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -755,6 +755,27 @@ const void *__init of_get_flat_dt_prop(unsigned long node, 
const char *name,
return fdt_getprop(initial_boot_params, node, name, size);
 }
 
+/*
+ * of_flat_read_u32 - Return the value of the given property as an u32.
+ *
+ * @node: device node from which the property value is to be read
+ * @propname: name of the property
+ * @out_value: the value of the property
+ * @return: 0 on success, -EINVAL if property does not exist
+ */
+int __init of_flat_read_u32(unsigned long node, const char *propname,
+   u32 *out_value)
+{
+   const __be32 *reg;
+
+   reg = of_get_flat_dt_prop(node, propname, NULL);
+   if (!reg)
+   return -EINVAL;
+
+   *out_value = be32_to_cpup(reg);
+   return 0;
+}
+
 /**
  * of_fdt_is_compatible - Return true if given node from the given blob has
  * compat in its compatible list
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 0e26f8c3b10e..d7901699061b 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -57,6 +57,8 @@ extern const void *of_get_flat_dt_prop(unsigned long node, 
const char *name,
 extern int of_flat_dt_is_compatible(unsigned long node, const char *name);
 extern unsigned long of_get_flat_dt_root(void);
 extern uint32_t of_get_flat_dt_phandle(unsigned long node);
+extern int of_flat_read_u32(unsigned long node, const char *propname,
+   u32 *out_value);
 
 extern int early_init_dt_scan_chosen(char *cmdline);
 extern int early_init_dt_scan_memory(void);
-- 
2.43.0




[PATCH RFC v3 14/35] of: fdt: Return the region size in of_flat_dt_translate_address()

2024-01-25 Thread Alexandru Elisei
Alongside the base address, arm64 will also need to know the size of a
tag storage region. Teach of_flat_dt_translate_address() to parse and
return the size.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch, suggested by Rob Herring.

 arch/sh/kernel/cpu/sh2/probe.c |  2 +-
 drivers/of/fdt_address.c   | 12 +---
 drivers/tty/serial/earlycon.c  |  2 +-
 include/linux/of_fdt.h |  2 +-
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh2/probe.c b/arch/sh/kernel/cpu/sh2/probe.c
index 70a07f4f2142..fa8904e8f390 100644
--- a/arch/sh/kernel/cpu/sh2/probe.c
+++ b/arch/sh/kernel/cpu/sh2/probe.c
@@ -21,7 +21,7 @@ static int __init scan_cache(unsigned long node, const char 
*uname,
if (!of_flat_dt_is_compatible(node, "jcore,cache"))
return 0;
 
-   j2_ccr_base = ioremap(of_flat_dt_translate_address(node), 4);
+   j2_ccr_base = ioremap(of_flat_dt_translate_address(node, NULL), 4);
 
return 1;
 }
diff --git a/drivers/of/fdt_address.c b/drivers/of/fdt_address.c
index 1dc15ab78b10..4c08d710 100644
--- a/drivers/of/fdt_address.c
+++ b/drivers/of/fdt_address.c
@@ -160,7 +160,8 @@ static int __init fdt_translate_one(const void *blob, int 
parent,
  * that can be mapped to a cpu physical address). This is not really specified
  * that way, but this is traditionally the way IBM at least do things
  */
-static u64 __init fdt_translate_address(const void *blob, int node_offset)
+static u64 __init fdt_translate_address(const void *blob, int node_offset,
+   u64 *out_size)
 {
int parent, len;
const struct of_bus *bus, *pbus;
@@ -193,6 +194,9 @@ static u64 __init fdt_translate_address(const void *blob, 
int node_offset)
goto bail;
}
memcpy(addr, reg, na * 4);
+   /* The size of the region doesn't need translating. */
+   if (out_size)
+   *out_size = of_read_number(reg + na, ns);
 
pr_debug("bus (na=%d, ns=%d) on %s\n",
 na, ns, fdt_get_name(blob, parent, NULL));
@@ -242,8 +246,10 @@ static u64 __init fdt_translate_address(const void *blob, 
int node_offset)
 /**
  * of_flat_dt_translate_address - translate DT addr into CPU phys addr
  * @node: node in the flat blob
+ * @out_size: size of the region, can be NULL if not needed
+ * @return: the address, OF_BAD_ADDR in case of error
  */
-u64 __init of_flat_dt_translate_address(unsigned long node)
+u64 __init of_flat_dt_translate_address(unsigned long node, u64 *out_size)
 {
-   return fdt_translate_address(initial_boot_params, node);
+   return fdt_translate_address(initial_boot_params, node, out_size);
 }
diff --git a/drivers/tty/serial/earlycon.c b/drivers/tty/serial/earlycon.c
index a5fbb6ed38ae..e941cf786232 100644
--- a/drivers/tty/serial/earlycon.c
+++ b/drivers/tty/serial/earlycon.c
@@ -265,7 +265,7 @@ int __init of_setup_earlycon(const struct earlycon_id 
*match,
 
spin_lock_init(>lock);
port->iotype = UPIO_MEM;
-   addr = of_flat_dt_translate_address(node);
+   addr = of_flat_dt_translate_address(node, NULL);
if (addr == OF_BAD_ADDR) {
pr_warn("[%s] bad address\n", match->name);
return -ENXIO;
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index d69ad5bb1eb1..0e26f8c3b10e 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -36,7 +36,7 @@ extern char __dtb_start[];
 extern char __dtb_end[];
 
 /* Other Prototypes */
-extern u64 of_flat_dt_translate_address(unsigned long node);
+extern u64 of_flat_dt_translate_address(unsigned long node, u64 *out_size);
 extern void of_fdt_limit_memory(int limit);
 #endif /* CONFIG_OF_FLATTREE */
 
-- 
2.43.0




[PATCH RFC v3 13/35] mm: memory: Introduce fault-on-access mechanism for pages

2024-01-25 Thread Alexandru Elisei
Introduce a mechanism that allows an architecture to trigger a page fault,
and add the infrastructure to handle that fault accordingly. To make use of
this, an arch is expected to mark the table entry as PAGE_NONE (which
will cause a fault next time it is accessed) and to implement an
arch-specific method (like a software bit) for recognizing that the fault
needs to be handled by the arch code.

arm64 will use this approach to reserve tag storage for pages which are
mapped in an MTE enabled VMA, but the storage needed to store tags isn't
reserved (for example, because of an mprotect(PROT_MTE) call on a VMA with
existing pages).

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch. Split from patch #19 ("mm: mprotect: Introduce PAGE_FAULT_ON_ACCESS
for mprotect(PROT_MTE)") (David Hildenbrand).

 include/linux/huge_mm.h |  4 ++--
 include/linux/pgtable.h | 47 +++--
 mm/Kconfig  |  3 +++
 mm/huge_memory.c| 36 +
 mm/memory.c | 51 ++---
 5 files changed, 109 insertions(+), 32 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 5adb86af35fc..4678a0a5e6a8 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -346,7 +346,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, 
unsigned long addr,
 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, int flags, struct dev_pagemap **pgmap);
 
-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
+vm_fault_t handle_huge_pmd_protnone(struct vm_fault *vmf);
 
 extern struct page *huge_zero_page;
 extern unsigned long huge_zero_pfn;
@@ -476,7 +476,7 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
return NULL;
 }
 
-static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
+static inline vm_fault_t handle_huge_pmd_protnone(struct vm_fault *vmf)
 {
return 0;
 }
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2d0f04042f62..81a21be855a2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1455,7 +1455,7 @@ static inline int pud_trans_unstable(pud_t *pud)
return 0;
 }
 
-#ifndef CONFIG_NUMA_BALANCING
+#if !defined(CONFIG_NUMA_BALANCING) && 
!defined(CONFIG_ARCH_HAS_FAULT_ON_ACCESS)
 /*
  * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is
  * perfectly valid to indicate "no" in that case, which is why our default
@@ -1477,7 +1477,50 @@ static inline int pmd_protnone(pmd_t pmd)
 {
return 0;
 }
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_NUMA_BALANCING && !CONFIG_ARCH_HAS_FAULT_ON_ACCESS */
+
+#ifndef CONFIG_ARCH_HAS_FAULT_ON_ACCESS
+static inline bool arch_fault_on_access_pte(pte_t pte)
+{
+   return false;
+}
+
+static inline bool arch_fault_on_access_pmd(pmd_t pmd)
+{
+   return false;
+}
+
+/*
+ * The function is called with the fault lock held and an elevated reference on
+ * the folio.
+ *
+ * Rules that an arch implementation of the function must follow:
+ *
+ * 1. The function must return with the elevated reference dropped.
+ *
+ * 2. If the return value contains VM_FAULT_RETRY or VM_FAULT_COMPLETED then:
+ *
+ * - if FAULT_FLAG_RETRY_NOWAIT is not set, the function must return with the
+ *   correct fault lock released, which can be accomplished with
+ *   release_fault_lock(vmf). Note that release_fault_lock() doesn't check if
+ *   FAULT_FLAG_RETRY_NOWAIT is set before releasing the mmap_lock.
+ *
+ * - if FAULT_FLAG_RETRY_NOWAIT is set, then the function must not release the
+ *   mmap_lock. The flag should be set only if the mmap_lock is held.
+ *
+ * 3. If the return value contains neither of the above, the function must not
+ * release the fault lock; the generic fault handler will take care of 
releasing
+ * the correct lock.
+ */
+static inline vm_fault_t arch_handle_folio_fault_on_access(struct folio *folio,
+  struct vm_fault *vmf,
+  bool *map_pte)
+{
+   *map_pte = false;
+
+   return VM_FAULT_SIGBUS;
+}
+#endif
 
 #endif /* CONFIG_MMU */
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 341cf53898db..153df67221f1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1006,6 +1006,9 @@ config IDLE_PAGE_TRACKING
 config ARCH_HAS_CACHE_LINE_SIZE
bool
 
+config ARCH_HAS_FAULT_ON_ACCESS
+   bool
+
 config ARCH_HAS_CURRENT_STACK_POINTER
bool
help
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 94ef5c02b459..2bad63a7ec16 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1698,7 +1698,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct 
*vma,
 }
 
 /* NUMA hinting page fault entry point for trans huge pmds */
-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vm

[PATCH RFC v3 12/35] mm: Call arch_swap_prepare_to_restore() before arch_swap_restore()

2024-01-25 Thread Alexandru Elisei
arm64 uses arch_swap_restore() to restore saved tags before the page is
swapped in and it's called in atomic context (with the ptl lock held).

Introduce arch_swap_prepare_to_restore() that will allow an architecture to
perform extra work during swap in and outside of a critical section.
This will be used by arm64 to allocate a buffer in memory in which to
temporarily save tags if tag storage is not available for the page being
swapped in.

Signed-off-by: Alexandru Elisei 
---
 include/linux/pgtable.h | 7 +++
 mm/memory.c | 4 
 mm/shmem.c  | 9 +
 mm/swapfile.c   | 5 +
 4 files changed, 25 insertions(+)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 98f81ca08cbe..2d0f04042f62 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -959,6 +959,13 @@ static inline void arch_swap_invalidate_area(int type)
 }
 #endif
 
+#ifndef __HAVE_ARCH_SWAP_PREPARE_TO_RESTORE
+static inline vm_fault_t arch_swap_prepare_to_restore(swp_entry_t entry, 
struct folio *folio)
+{
+   return 0;
+}
+#endif
+
 #ifndef __HAVE_ARCH_SWAP_RESTORE
 static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 7e1f4849463a..8a421e168b57 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3975,6 +3975,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
folio_throttle_swaprate(folio, GFP_KERNEL);
 
+   ret = arch_swap_prepare_to_restore(entry, folio);
+   if (ret)
+   goto out_page;
+
/*
 * Back out if somebody else already faulted in this pte.
 */
diff --git a/mm/shmem.c b/mm/shmem.c
index 14427e9982f9..621fabc3b8c6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1855,6 +1855,7 @@ static int shmem_swapin_folio(struct inode *inode, 
pgoff_t index,
struct swap_info_struct *si;
struct folio *folio = NULL;
swp_entry_t swap;
+   vm_fault_t ret;
int error;
 
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
@@ -1903,6 +1904,14 @@ static int shmem_swapin_folio(struct inode *inode, 
pgoff_t index,
}
folio_wait_writeback(folio);
 
+   ret = arch_swap_prepare_to_restore(swap, folio);
+   if (ret) {
+   if (fault_type)
+   *fault_type = ret;
+   error = -EINVAL;
+   goto unlock;
+   }
+
/*
 * Some architectures may have to restore extra metadata to the
 * folio after reading from swap.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f..49425598f778 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1785,6 +1785,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t 
*pmd,
goto setpte;
}
 
+   if (arch_swap_prepare_to_restore(entry, folio)) {
+   ret = -EINVAL;
+   goto out;
+   }
+
/*
 * Some architectures may have to restore extra metadata to the page
 * when reading from swap. This metadata may be indexed by swap entry
-- 
2.43.0




[PATCH RFC v3 11/35] mm: Allow an arch to hook into folio allocation when VMA is known

2024-01-25 Thread Alexandru Elisei
arm64 uses VM_HIGH_ARCH_0 and VM_HIGH_ARCH_1 for enabling MTE for a VMA.
When VM_HIGH_ARCH_0, which arm64 renames to VM_MTE, is set for a VMA, and
the gfp flag __GFP_ZERO is present, the __GFP_ZEROTAGS gfp flag also gets
set in vma_alloc_zeroed_movable_folio().

Expand this to be more generic by adding an arch hook that modifies the gfp
flags for an allocation when the VMA is known.

Note that __GFP_ZEROTAGS is ignored by the page allocator unless __GFP_ZERO
is also set; from that point of view, the current behaviour is unchanged,
even though the arm64 flag is set in more places.  When arm64 will have
support to reuse the tag storage for data allocation, the uses of the
__GFP_ZEROTAGS flag will be expanded to instruct the page allocator to try
to reserve the corresponding tag storage for the pages being allocated.

The flags returned by arch_calc_vma_gfp() are or'ed with the flags set by
the caller; this has been done to keep an architecture from modifying the
flags already set by the core memory management code; this is similar to
how do_mmap() -> calc_vm_flag_bits() -> arch_calc_vm_flag_bits() has been
implemented. This can be revisited in the future if there's a need to do
so.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/include/asm/page.h|  5 ++---
 arch/arm64/include/asm/pgtable.h |  3 +++
 arch/arm64/mm/fault.c| 19 ++-
 include/linux/pgtable.h  |  7 +++
 mm/mempolicy.c   |  1 +
 mm/shmem.c   |  5 -
 6 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 2312e6ee595f..88bab032a493 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -29,9 +29,8 @@ void copy_user_highpage(struct page *to, struct page *from,
 void copy_highpage(struct page *to, struct page *from);
 #define __HAVE_ARCH_COPY_HIGHPAGE
 
-struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
-   unsigned long vaddr);
-#define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio
+#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
+   vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
 
 void tag_clear_highpage(struct page *to);
 #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 79ce70fbb751..08f0904dbfc2 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1071,6 +1071,9 @@ static inline void arch_swap_restore(swp_entry_t entry, 
struct folio *folio)
 
 #endif /* CONFIG_ARM64_MTE */
 
+#define __HAVE_ARCH_CALC_VMA_GFP
+gfp_t arch_calc_vma_gfp(struct vm_area_struct *vma, gfp_t gfp);
+
 /*
  * On AArch64, the cache coherency is handled via the set_pte_at() function.
  */
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 55f6455a8284..4d3f0a870ad8 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -937,22 +937,15 @@ void do_debug_exception(unsigned long addr_if_watchpoint, 
unsigned long esr,
 NOKPROBE_SYMBOL(do_debug_exception);
 
 /*
- * Used during anonymous page fault handling.
+ * If this is called during anonymous page fault handling, and the page is
+ * mapped with PROT_MTE, initialise the tags at the point of tag zeroing as 
this
+ * is usually faster than separate DC ZVA and STGM.
  */
-struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
-   unsigned long vaddr)
+gfp_t arch_calc_vma_gfp(struct vm_area_struct *vma, gfp_t gfp)
 {
-   gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;
-
-   /*
-* If the page is mapped with PROT_MTE, initialise the tags at the
-* point of allocation and page zeroing as this is usually faster than
-* separate DC ZVA and STGM.
-*/
if (vma->vm_flags & VM_MTE)
-   flags |= __GFP_ZEROTAGS;
-
-   return vma_alloc_folio(flags, 0, vma, vaddr, false);
+   return __GFP_ZEROTAGS;
+   return 0;
 }
 
 void tag_clear_highpage(struct page *page)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c5ddec6b5305..98f81ca08cbe 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -901,6 +901,13 @@ static inline void arch_do_swap_page(struct mm_struct *mm,
 }
 #endif
 
+#ifndef __HAVE_ARCH_CALC_VMA_GFP
+static inline gfp_t arch_calc_vma_gfp(struct vm_area_struct *vma, gfp_t gfp)
+{
+   return 0;
+}
+#endif
+
 #ifndef __HAVE_ARCH_FREE_PAGES_PREPARE
 static inline void arch_free_pages_prepare(struct page *page, int order) { }
 #endif
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c89..f7ef52760b32 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2168,6 +2168,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, 
struct vm_area_struct *vma,
pgoff_t ilx;
struct page *page;

[PATCH RFC v3 10/35] mm: cma: Fast track allocating memory when the pages are free

2024-01-25 Thread Alexandru Elisei
If the pages to be allocated are free, take them directly off the buddy
allocator instead of going through alloc_contig_range(), thus avoiding
costly calls to lru_cache_disable().

Only allocations of the same size as the CMA region order are considered,
to avoid taking the zone spinlock for too long.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch. Reworked from the rfc v2 patch #26 ("arm64: mte: Fast track
reserving tag storage when the block is free") (David Hildenbrand).

 include/linux/page-flags.h | 15 --
 mm/Kconfig |  5 +
 mm/cma.c   | 42 ++
 mm/memory-failure.c|  8 
 mm/page_alloc.c| 23 -
 5 files changed, 73 insertions(+), 20 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 735cddc13d20..b7237bce7446 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -575,11 +575,22 @@ TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
 #define MAGIC_HWPOISON 0x48575053U /* HWPS */
 extern void SetPageHWPoisonTakenOff(struct page *page);
 extern void ClearPageHWPoisonTakenOff(struct page *page);
-extern bool take_page_off_buddy(struct page *page);
-extern bool put_page_back_buddy(struct page *page);
+extern bool PageHWPoisonTakenOff(struct page *page);
 #else
 PAGEFLAG_FALSE(HWPoison, hwpoison)
+TESTSCFLAG_FALSE(HWPoison, hwpoison)
 #define __PG_HWPOISON 0
+static inline void SetPageHWPoisonTakenOff(struct page *page) { }
+static inline void ClearPageHWPoisonTakenOff(struct page *page) { }
+static inline bool PageHWPoisonTakenOff(struct page *page)
+{
+  return false;
+}
+#endif
+
+#ifdef CONFIG_WANTS_TAKE_PAGE_OFF_BUDDY
+extern bool take_page_off_buddy(struct page *page, bool poison);
+extern bool put_page_back_buddy(struct page *page, bool unpoison);
 #endif
 
 #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
diff --git a/mm/Kconfig b/mm/Kconfig
index ffc3a2ba3a8c..341cf53898db 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -745,12 +745,16 @@ config DEFAULT_MMAP_MIN_ADDR
 config ARCH_SUPPORTS_MEMORY_FAILURE
bool
 
+config WANTS_TAKE_PAGE_OFF_BUDDY
+   bool
+
 config MEMORY_FAILURE
depends on MMU
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
select MEMORY_ISOLATION
select RAS
+   select WANTS_TAKE_PAGE_OFF_BUDDY
help
  Enables code to recover from some memory failures on systems
  with MCA recovery. This allows a system to continue running
@@ -891,6 +895,7 @@ config CMA
depends on MMU
select MIGRATION
select MEMORY_ISOLATION
+   select WANTS_TAKE_PAGE_OFF_BUDDY
help
  This enables the Contiguous Memory Allocator which allows other
  subsystems to allocate big physically-contiguous blocks of memory.
diff --git a/mm/cma.c b/mm/cma.c
index 2881bab12b01..15663f95d77b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -444,6 +444,34 @@ static void cma_debug_show_areas(struct cma *cma)
 static inline void cma_debug_show_areas(struct cma *cma) { }
 #endif
 
+/* Called with the cma mutex held. */
+static int cma_alloc_pages_fastpath(struct cma *cma, unsigned long start,
+   unsigned long end)
+{
+   bool success = false;
+   unsigned long i, j;
+
+   /* Avoid contention on the zone lock. */
+   if (start - end != 1 << cma->order_per_bit)
+   return -EINVAL;
+
+   for (i = start; i < end; i++) {
+   if (!is_free_buddy_page(pfn_to_page(i)))
+   break;
+   success = take_page_off_buddy(pfn_to_page(i), false);
+   if (!success)
+   break;
+   }
+
+   if (success)
+   return 0;
+
+   for (j = start; j < i; j++)
+   put_page_back_buddy(pfn_to_page(j), false);
+
+   return -EBUSY;
+}
+
 /**
  * cma_alloc_range() - allocate pages in a specific range
  * @cma:   Contiguous memory region for which the allocation is performed.
@@ -493,7 +521,11 @@ int cma_alloc_range(struct cma *cma, unsigned long start, 
unsigned long count,
 
for (i = 0; i < tries; i++) {
mutex_lock(_mutex);
-   err = alloc_contig_range(start, start + count, MIGRATE_CMA, 
gfp);
+   err = cma_alloc_pages_fastpath(cma, start, start + count);
+   if (err) {
+   err = alloc_contig_range(start, start + count,
+MIGRATE_CMA, gfp);
+   }
mutex_unlock(_mutex);
 
if (err != -EBUSY)
@@ -529,7 +561,6 @@ int cma_alloc_range(struct cma *cma, unsigned long start, 
unsigned long count,
return err;
 }
 
-
 /**
  * cma_alloc() - allocate pages from contiguous area
  * @cma:   Contiguous memor

[PATCH RFC v3 09/35] mm: cma: Introduce cma_remove_mem()

2024-01-25 Thread Alexandru Elisei
Memory is added to CMA with cma_declare_contiguous_nid() and
cma_init_reserved_mem(). This memory is then put on the MIGRATE_CMA list in
cma_init_reserved_areas(), where the page allocator can make use of it.

If a device manages multiple CMA areas, and there's an error when one of
the areas is added to CMA, there is no mechanism for the device to prevent
the rest of the areas, which were added before the error occurred, from
being later added to the MIGRATE_CMA list.

Add cma_remove_mem() which allows a previously reserved CMA area to be
removed and thus it cannot be used by the page allocator.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch.

 include/linux/cma.h |  1 +
 mm/cma.c| 30 +-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/include/linux/cma.h b/include/linux/cma.h
index e32559da6942..787cbec1702e 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -48,6 +48,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, 
phys_addr_t size,
unsigned int order_per_bit,
const char *name,
struct cma **res_cma);
+extern void cma_remove_mem(struct cma **res_cma);
 extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned 
int align,
  bool no_warn);
 extern int cma_alloc_range(struct cma *cma, unsigned long start, unsigned long 
count,
diff --git a/mm/cma.c b/mm/cma.c
index 4a0f68b9443b..2881bab12b01 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -147,8 +147,12 @@ static int __init cma_init_reserved_areas(void)
 {
int i;
 
-   for (i = 0; i < cma_area_count; i++)
+   for (i = 0; i < cma_area_count; i++) {
+   /* Region was removed. */
+   if (!cma_areas[i].count)
+   continue;
cma_activate_area(_areas[i]);
+   }
 
return 0;
 }
@@ -216,6 +220,30 @@ int __init cma_init_reserved_mem(phys_addr_t base, 
phys_addr_t size,
return 0;
 }
 
+/**
+ * cma_remove_mem() - remove cma area
+ * @res_cma: Pointer to the cma region.
+ *
+ * This function removes a cma region created with cma_init_reserved_mem(). The
+ * ->count is set to 0.
+ */
+void __init cma_remove_mem(struct cma **res_cma)
+{
+   struct cma *cma;
+
+   if (WARN_ON_ONCE(!res_cma || !(*res_cma)))
+   return;
+
+   cma = *res_cma;
+   if (WARN_ON_ONCE(!cma->count))
+   return;
+
+   totalcma_pages -= cma->count;
+   cma->count = 0;
+
+   *res_cma = NULL;
+}
+
 /**
  * cma_declare_contiguous_nid() - reserve custom contiguous area
  * @base: Base address of the reserved area optional, use 0 for any
-- 
2.43.0




[PATCH RFC v3 08/35] mm: cma: Introduce cma_alloc_range()

2024-01-25 Thread Alexandru Elisei
Today, cma_alloc() is used to allocate a contiguous memory region. The
function allows the caller to specify the number of pages to allocate, but
not the starting address. cma_alloc() will walk over the entire CMA region
trying to allocate the first available range of the specified size.

Introduce cma_alloc_range(), which makes CMA more versatile by allowing the
caller to specify a particular range in the CMA region, defined by the
start pfn and the size.

arm64 will make use of this function when tag storage management is
implemented: cma_alloc_range() will be used to reserve the tag storage
associated with a tagged page.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch.

 include/linux/cma.h|  2 +
 include/trace/events/cma.h | 59 ++
 mm/cma.c   | 86 ++
 3 files changed, 147 insertions(+)

diff --git a/include/linux/cma.h b/include/linux/cma.h
index 63873b93deaa..e32559da6942 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -50,6 +50,8 @@ extern int cma_init_reserved_mem(phys_addr_t base, 
phys_addr_t size,
struct cma **res_cma);
 extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned 
int align,
  bool no_warn);
+extern int cma_alloc_range(struct cma *cma, unsigned long start, unsigned long 
count,
+  unsigned tries, gfp_t gfp);
 extern bool cma_pages_valid(struct cma *cma, const struct page *pages, 
unsigned long count);
 extern bool cma_release(struct cma *cma, const struct page *pages, unsigned 
long count);
 
diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
index 25103e67737c..a89af313a572 100644
--- a/include/trace/events/cma.h
+++ b/include/trace/events/cma.h
@@ -36,6 +36,65 @@ TRACE_EVENT(cma_release,
  __entry->count)
 );
 
+TRACE_EVENT(cma_alloc_range_start,
+
+   TP_PROTO(const char *name, unsigned long start, unsigned long count,
+unsigned tries),
+
+   TP_ARGS(name, start, count, tries),
+
+   TP_STRUCT__entry(
+   __string(name, name)
+   __field(unsigned long, start)
+   __field(unsigned long, count)
+   __field(unsigned, tries)
+   ),
+
+   TP_fast_assign(
+   __assign_str(name, name);
+   __entry->start = start;
+   __entry->count = count;
+   __entry->tries = tries;
+   ),
+
+   TP_printk("name=%s start=%lx count=%lu tries=%u",
+ __get_str(name),
+ __entry->start,
+ __entry->count,
+ __entry->tries)
+);
+
+TRACE_EVENT(cma_alloc_range_finish,
+
+   TP_PROTO(const char *name, unsigned long start, unsigned long count,
+unsigned attempts, int err),
+
+   TP_ARGS(name, start, count, attempts, err),
+
+   TP_STRUCT__entry(
+   __string(name, name)
+   __field(unsigned long, start)
+   __field(unsigned long, count)
+   __field(unsigned, attempts)
+   __field(int, err)
+   ),
+
+   TP_fast_assign(
+   __assign_str(name, name);
+   __entry->start = start;
+   __entry->count = count;
+   __entry->attempts = attempts;
+   __entry->err = err;
+   ),
+
+   TP_printk("name=%s start=%lx count=%lu attempts=%u err=%d",
+ __get_str(name),
+ __entry->start,
+ __entry->count,
+ __entry->attempts,
+ __entry->err)
+);
+
 TRACE_EVENT(cma_alloc_start,
 
TP_PROTO(const char *name, unsigned long count, unsigned int align),
diff --git a/mm/cma.c b/mm/cma.c
index 543bb6b3be8e..4a0f68b9443b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -416,6 +416,92 @@ static void cma_debug_show_areas(struct cma *cma)
 static inline void cma_debug_show_areas(struct cma *cma) { }
 #endif
 
+/**
+ * cma_alloc_range() - allocate pages in a specific range
+ * @cma:   Contiguous memory region for which the allocation is performed.
+ * @start: Starting pfn of the allocation.
+ * @count: Requested number of pages
+ * @tries: Number of tries if the range is busy
+ * @no_warn: Avoid printing message about failed allocation
+ *
+ * This function allocates part of contiguous memory from a specific contiguous
+ * memory area, from the specified starting address. The 'start' pfn and the
+ * 'count' number of pages must be aligned to the CMA bitmap order per bit.
+ */
+int cma_alloc_range(struct cma *cma, unsigned long start, unsigned long count,
+   unsigned tries, gfp_t gfp)
+{
+   unsigned long bitmap_maxno, bitmap_no, bitmap_start, bitmap_count;
+   unsigned long i = 0;
+   struct page *page;
+   int err = -EINVAL;
+
+   

[PATCH RFC v3 07/35] mm: cma: Add CMA_RELEASE_{SUCCESS,FAIL} events

2024-01-25 Thread Alexandru Elisei
Similar to the two events that relate to CMA allocations, add the
CMA_RELEASE_SUCCESS and CMA_RELEASE_FAIL events that count when CMA pages
are freed.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch.

 include/linux/vm_event_item.h | 2 ++
 mm/cma.c  | 6 +-
 mm/vmstat.c   | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 747943bc8cc2..aba5c5bf8127 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -83,6 +83,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_CMA
CMA_ALLOC_SUCCESS,
CMA_ALLOC_FAIL,
+   CMA_RELEASE_SUCCESS,
+   CMA_RELEASE_FAIL,
 #endif
UNEVICTABLE_PGCULLED,   /* culled to noreclaim list */
UNEVICTABLE_PGSCANNED,  /* scanned for reclaimability */
diff --git a/mm/cma.c b/mm/cma.c
index dbf7fe8cb1bd..543bb6b3be8e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -562,8 +562,10 @@ bool cma_release(struct cma *cma, const struct page *pages,
 {
unsigned long pfn;
 
-   if (!cma_pages_valid(cma, pages, count))
+   if (!cma_pages_valid(cma, pages, count)) {
+   count_vm_events(CMA_RELEASE_FAIL, count);
return false;
+   }
 
pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
 
@@ -575,6 +577,8 @@ bool cma_release(struct cma *cma, const struct page *pages,
cma_clear_bitmap(cma, pfn, count);
trace_cma_release(cma->name, pfn, pages, count);
 
+   count_vm_events(CMA_RELEASE_SUCCESS, count);
+
return true;
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db79935e4a54..eebfd5c6c723 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1340,6 +1340,8 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_CMA
"cma_alloc_success",
"cma_alloc_fail",
+   "cma_release_success",
+   "cma_release_fail",
 #endif
"unevictable_pgs_culled",
"unevictable_pgs_scanned",
-- 
2.43.0




[PATCH RFC v3 06/35] mm: cma: Make CMA_ALLOC_SUCCESS/FAIL count the number of pages

2024-01-25 Thread Alexandru Elisei
The CMA_ALLOC_SUCCESS and CMA_ALLOC_FAIL counters are increased by one after
each successful, respectively failed, cma_alloc() function call. This is done
even though cma_alloc()
can allocate an arbitrary number of CMA pages. When looking at
/proc/vmstat, the number of successful (or failed) cma_alloc() calls
doesn't tell much with regards to how many CMA pages were allocated via
cma_alloc() versus via the page allocator (regular allocation request or
PCP lists refill).

This can also be rather confusing to a user who isn't familiar with the
code, since the unit of measurement for nr_free_cma is the number of pages,
but cma_alloc_success and cma_alloc_fail count the number of cma_alloc()
function calls.

Let's make this consistent, and arguably more useful, by having
CMA_ALLOC_SUCCESS count the number of successfully allocated CMA pages, and
CMA_ALLOC_FAIL count the number of pages the cma_alloc() failed to
allocate.

For users that wish to track the number of cma_alloc() calls, there are
tracepoints for that already implemented.

Signed-off-by: Alexandru Elisei 
---
 mm/cma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/cma.c b/mm/cma.c
index f49c95f8ee37..dbf7fe8cb1bd 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -517,10 +517,10 @@ struct page *cma_alloc(struct cma *cma, unsigned long 
count,
pr_debug("%s(): returned %p\n", __func__, page);
 out:
if (page) {
-   count_vm_event(CMA_ALLOC_SUCCESS);
+   count_vm_events(CMA_ALLOC_SUCCESS, count);
cma_sysfs_account_success_pages(cma, count);
} else {
-   count_vm_event(CMA_ALLOC_FAIL);
+   count_vm_events(CMA_ALLOC_FAIL, count);
if (cma)
cma_sysfs_account_fail_pages(cma, count);
}
-- 
2.43.0




[PATCH RFC v3 05/35] mm: cma: Don't append newline when generating CMA area name

2024-01-25 Thread Alexandru Elisei
cma->name is displayed in several CMA messages. When the name is generated
by the CMA code, don't append a newline to avoid breaking the text across
two lines.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch. This is a fix, and can be merged independently of the other
patches.

 mm/cma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/cma.c b/mm/cma.c
index 7c09c47e530b..f49c95f8ee37 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -204,7 +204,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, 
phys_addr_t size,
if (name)
snprintf(cma->name, CMA_MAX_NAME, name);
else
-   snprintf(cma->name, CMA_MAX_NAME,  "cma%d\n", cma_area_count);
+   snprintf(cma->name, CMA_MAX_NAME,  "cma%d", cma_area_count);
 
cma->base_pfn = PFN_DOWN(base);
cma->count = size >> PAGE_SHIFT;
-- 
2.43.0




[PATCH RFC v3 04/35] mm: page_alloc: Partially revert "mm: page_alloc: remove stale CMA guard code"

2024-01-25 Thread Alexandru Elisei
The patch f945116e4e19 ("mm: page_alloc: remove stale CMA guard code")
removed the CMA filter when allocating from the MIGRATE_MOVABLE pcp list
because CMA is always allowed when __GFP_MOVABLE is set.

With the introduction of the arch_alloc_cma() function, the above is not
true anymore, so bring back the filter.

This is a partial revert because the stale comment remains removed.

Signed-off-by: Alexandru Elisei 
---
 mm/page_alloc.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a96d47a6393e..0fa34bcfb1af 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2897,10 +2897,17 @@ struct page *rmqueue(struct zone *preferred_zone,
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
 
if (likely(pcp_allowed_order(order))) {
-   page = rmqueue_pcplist(preferred_zone, zone, order,
-  migratetype, alloc_flags);
-   if (likely(page))
-   goto out;
+   /*
+* MIGRATE_MOVABLE pcplist could have the pages on CMA area and
+* we need to skip it when CMA area isn't allowed.
+*/
+   if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
+   migratetype != MIGRATE_MOVABLE) {
+   page = rmqueue_pcplist(preferred_zone, zone, order,
+   migratetype, alloc_flags);
+   if (likely(page))
+   goto out;
+   }
}
 
page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
-- 
2.43.0




[PATCH RFC v3 03/35] mm: page_alloc: Add an arch hook to filter MIGRATE_CMA allocations

2024-01-25 Thread Alexandru Elisei
As an architecture might have specific requirements around the allocation
of CMA pages, add an arch hook that can disable allocations from
MIGRATE_CMA, if the allocation was otherwise allowed.

This will be used by arm64, which will put tag storage pages on the
MIGRATE_CMA list, and tag storage pages cannot be tagged. The filter will
be used to deny using MIGRATE_CMA for __GFP_TAGGED allocations.

Signed-off-by: Alexandru Elisei 
---
 include/linux/pgtable.h | 7 +++
 mm/page_alloc.c | 3 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 6d98d5fdd697..c5ddec6b5305 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -905,6 +905,13 @@ static inline void arch_do_swap_page(struct mm_struct *mm,
 static inline void arch_free_pages_prepare(struct page *page, int order) { }
 #endif
 
+#ifndef __HAVE_ARCH_ALLOC_CMA
+static inline bool arch_alloc_cma(gfp_t gfp)
+{
+   return true;
+}
+#endif
+
 #ifndef __HAVE_ARCH_UNMAP_ONE
 /*
  * Some architectures support metadata associated with a page. When a
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 27282a1c82fe..a96d47a6393e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3157,7 +3157,8 @@ static inline unsigned int gfp_to_alloc_flags_cma(gfp_t 
gfp_mask,
  unsigned int alloc_flags)
 {
 #ifdef CONFIG_CMA
-   if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+   if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
+   arch_alloc_cma(gfp_mask))
alloc_flags |= ALLOC_CMA;
 #endif
return alloc_flags;
-- 
2.43.0




[PATCH RFC v3 02/35] mm: page_alloc: Add an arch hook early in free_pages_prepare()

2024-01-25 Thread Alexandru Elisei
The arm64 MTE code uses the PG_arch_2 page flag, which it renames to
PG_mte_tagged, to track if a page has been mapped with tagging enabled.
That flag is cleared by free_pages_prepare() by doing:

page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;

When tag storage management is added, tag storage will be reserved for a
page if and only if the page is mapped as tagged (the page flag
PG_mte_tagged is set). When a page is freed, likewise, the code will have
to look at the page flags to determine if the page has tag storage
reserved, which should also be freed.

For this purpose, add an arch_free_pages_prepare() hook that is called
before the page flags are cleared. The function arch_free_page() has also
been considered for this purpose, but it is called after the flags are
cleared.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* Expanded commit message (David Hildenbrand).

 include/linux/pgtable.h | 4 
 mm/page_alloc.c | 1 +
 2 files changed, 5 insertions(+)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index f6d0e3513948..6d98d5fdd697 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -901,6 +901,10 @@ static inline void arch_do_swap_page(struct mm_struct *mm,
 }
 #endif
 
+#ifndef __HAVE_ARCH_FREE_PAGES_PREPARE
+static inline void arch_free_pages_prepare(struct page *page, int order) { }
+#endif
+
 #ifndef __HAVE_ARCH_UNMAP_ONE
 /*
  * Some architectures support metadata associated with a page. When a
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c140abe5ee6..27282a1c82fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1092,6 +1092,7 @@ static __always_inline bool free_pages_prepare(struct 
page *page,
 
trace_mm_page_free(page, order);
kmsan_free_page(page, order);
+   arch_free_pages_prepare(page, order);
 
if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
-- 
2.43.0




[PATCH RFC v3 01/35] mm: page_alloc: Add gfp_flags parameter to arch_alloc_page()

2024-01-25 Thread Alexandru Elisei
Extend the usefulness of arch_alloc_page() by adding the gfp_flags
parameter.

Signed-off-by: Alexandru Elisei 
---

Changes since rfc v2:

* New patch.

 arch/s390/include/asm/page.h | 2 +-
 arch/s390/mm/page-states.c   | 2 +-
 include/linux/gfp.h  | 2 +-
 mm/page_alloc.c  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 73b9c3bf377f..859f0958c574 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -163,7 +163,7 @@ static inline int page_reset_referenced(unsigned long addr)
 
 struct page;
 void arch_free_page(struct page *page, int order);
-void arch_alloc_page(struct page *page, int order);
+void arch_alloc_page(struct page *page, int order, gfp_t gfp_flags);
 
 static inline int devmem_is_allowed(unsigned long pfn)
 {
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index 01f9b39e65f5..b986c8b158e3 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -21,7 +21,7 @@ void arch_free_page(struct page *page, int order)
__set_page_unused(page_to_virt(page), 1UL << order);
 }
 
-void arch_alloc_page(struct page *page, int order)
+void arch_alloc_page(struct page *page, int order, gfp_t gfp_flags)
 {
if (!cmma_flag)
return;
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index de292a007138..9e8aa3d144db 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -172,7 +172,7 @@ static inline struct zonelist *node_zonelist(int nid, gfp_t 
flags)
 static inline void arch_free_page(struct page *page, int order) { }
 #endif
 #ifndef HAVE_ARCH_ALLOC_PAGE
-static inline void arch_alloc_page(struct page *page, int order) { }
+static inline void arch_alloc_page(struct page *page, int order, gfp_t 
gfp_flags) { }
 #endif
 
 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 150d4f23b010..2c140abe5ee6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1485,7 +1485,7 @@ inline void post_alloc_hook(struct page *page, unsigned 
int order,
set_page_private(page, 0);
set_page_refcounted(page);
 
-   arch_alloc_page(page, order);
+   arch_alloc_page(page, order, gfp_flags);
debug_pagealloc_map_pages(page, 1 << order);
 
/*
-- 
2.43.0




[PATCH RFC v3 00/35] Add support for arm64 MTE dynamic tag storage reuse

2024-01-25 Thread Alexandru Elisei
rent page if it
becomes PROT_MTE (mprotect()) and the range it is in does not support
tagging.

Some other complications are arm64-specific like cache coherency between
tags and data accesses. There is a draft architecture spec which will be
released soon, detailing how the hardware behaves.

All of this will be entirely transparent to userspace. As with the current
kernel (without this dynamic tag storage), a user only needs to ask for
PROT_MTE mappings to get tagged pages.


Implementation
==============

MTE tag storage reuse is accomplished with the following changes to the
Linux kernel:

1. The tag storage memory is exposed to the memory allocator as
MIGRATE_CMA. The arm64 code uses the newly added function cma_alloc_range() to
reserve tag storage when the associated page is allocated as tagged.

There is a limitation to this approach: all MIGRATE_CMA memory cannot be
used for tagged allocations, even if not all of it is tag storage.

2. mprotect(PROT_MTE) is implemented by adding a fault-on-access mechanism
for existing pages. When a page is next accessed, a fault is taken and the
corresponding tag storage is reserved.

3. When the code tries to copy tags to a page (when swapping in a newly
allocated page, or during migration/THP collapse) which doesn't have the
tag storage reserved, the tags are copied to an xarray and restored when
tag storage is reserved for the destination page.

4. KVM allows VMAs without MTE enabled to represent the memory of a virtual
machine with MTE enabled. Even though the host treats the pages that
represent guest memory as untagged, they have tags associated with them,
which are used by the guest. To make dynamic tag storage work with KVM, two
changes were necessary: try to reserve tag storage when a guest accesses an
address the first time, and if not possible, migrate the page to replace it
with a page with tag storage reserved; and a new VMA flag, VM_MTE_KVM, was
added so the page allocator will not use tag storage pages (which cannot be
tagged) for VM memory. The second change is a performance optimization.


Testing
=======

To enable MTE dynamic tag storage:

- CONFIG_ARM64_MTE_TAG_STORAGE=y
- system_supports_mte() returns true
- kasan_hw_tags_enabled() returns false
- correct DTB node. For an example that works with FVP, have a look at
patch #35 ("HACK! Add fake tag storage to fvp-base-revc.dts")

Check dmesg for the message "MTE tag storage region management enabled".

Alexandru Elisei (35):
  mm: page_alloc: Add gfp_flags parameter to arch_alloc_page()
  mm: page_alloc: Add an arch hook early in free_pages_prepare()
  mm: page_alloc: Add an arch hook to filter MIGRATE_CMA allocations
  mm: page_alloc: Partially revert "mm: page_alloc: remove stale CMA
guard code"
  mm: cma: Don't append newline when generating CMA area name
  mm: cma: Make CMA_ALLOC_SUCCESS/FAIL count the number of pages
  mm: cma: Add CMA_RELEASE_{SUCCESS,FAIL} events
  mm: cma: Introduce cma_alloc_range()
  mm: cma: Introduce cma_remove_mem()
  mm: cma: Fast track allocating memory when the pages are free
  mm: Allow an arch to hook into folio allocation when VMA is known
  mm: Call arch_swap_prepare_to_restore() before arch_swap_restore()
  mm: memory: Introduce fault-on-access mechanism for pages
  of: fdt: Return the region size in of_flat_dt_translate_address()
  of: fdt: Add of_flat_read_u32()
  KVM: arm64: Don't deny VM_PFNMAP VMAs when kvm_has_mte()
  arm64: mte: Rework naming for tag manipulation functions
  arm64: mte: Rename __GFP_ZEROTAGS to __GFP_TAGGED
  arm64: mte: Discover tag storage memory
  arm64: mte: Add tag storage memory to CMA
  arm64: mte: Disable dynamic tag storage management if HW KASAN is
enabled
  arm64: mte: Enable tag storage if CMA areas have been activated
  arm64: mte: Try to reserve tag storage in arch_alloc_page()
  arm64: mte: Perform CMOs for tag blocks
  arm64: mte: Reserve tag block for the zero page
  arm64: mte: Use fault-on-access to reserve missing tag storage
  arm64: mte: Handle tag storage pages mapped in an MTE VMA
  arm64: mte: swap: Handle tag restoring when missing tag storage
  arm64: mte: copypage: Handle tag restoring when missing tag storage
  arm64: mte: ptrace: Handle pages with missing tag storage
  khugepaged: arm64: Don't collapse MTE enabled VMAs
  KVM: arm64: mte: Reserve tag storage for virtual machines with MTE
  KVM: arm64: mte: Introduce VM_MTE_KVM VMA flag
  arm64: mte: Enable dynamic tag storage management
  HACK! Add fake tag storage to fvp-base-revc.dts

 .../reserved-memory/arm,mte-tag-storage.yaml  |  78 +++
 arch/arm64/Kconfig|  14 +
 arch/arm64/boot/dts/arm/fvp-base-revc.dts |  42 +-
 arch/arm64/include/asm/assembler.h|  10 +
 arch/arm64/include/asm/mte-def.h  |  16 +-
 arch/arm64/include/asm/mte.h  |  43 +-
 arch/arm64/include/asm/mte_tag_storage.h  |  83 +++
 arch/arm64/include/asm/page.h |

Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-18 Thread Alexandru Elisei
Hi,

On Thu, Dec 14, 2023 at 12:55:14PM -0600, Rob Herring wrote:
> On Thu, Dec 14, 2023 at 9:45 AM Alexandru Elisei
>  wrote:
> >
> > Hi,
> >
> > On Wed, Dec 13, 2023 at 02:30:42PM -0600, Rob Herring wrote:
> > > On Wed, Dec 13, 2023 at 11:44 AM Alexandru Elisei
> > >  wrote:
> > > >
> > > > On Wed, Dec 13, 2023 at 11:22:17AM -0600, Rob Herring wrote:
> > > > > On Wed, Dec 13, 2023 at 8:51 AM Alexandru Elisei
> > > > >  wrote:
> > > > > >
> > > > > > Hi,
> > > > > >
> > > > > > On Wed, Dec 13, 2023 at 08:06:44AM -0600, Rob Herring wrote:
> > > > > > > On Wed, Dec 13, 2023 at 7:05 AM Alexandru Elisei
> > > > > > >  wrote:
> > > > > > > >
> > > > > > > > Hi Rob,
> > > > > > > >
> > > > > > > > On Tue, Dec 12, 2023 at 12:44:06PM -0600, Rob Herring wrote:
> > > > > > > > > On Tue, Dec 12, 2023 at 10:38 AM Alexandru Elisei
> > > > > > > > >  wrote:
> > > > > > > > > >
> > > > > > > > > > Hi Rob,
> > > > > > > > > >
> > > > > > > > > > Thank you so much for the feedback, I'm not very familiar 
> > > > > > > > > > with device tree,
> > > > > > > > > > and any comments are very useful.
> > > > > > > > > >
> > > > > > > > > > On Mon, Dec 11, 2023 at 11:29:40AM -0600, Rob Herring wrote:
> > > > > > > > > > > On Sun, Nov 19, 2023 at 10:59 AM Alexandru Elisei
> > > > > > > > > > >  wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > Allow the kernel to get the size and location of the 
> > > > > > > > > > > > MTE tag storage
> > > > > > > > > > > > regions from the DTB. This memory is marked as reserved 
> > > > > > > > > > > > for now.
> > > > > > > > > > > >
> > > > > > > > > > > > The DTB node for the tag storage region is defined as:
> > > > > > > > > > > >
> > > > > > > > > > > > tags0: tag-storage@8f800 {
> > > > > > > > > > > > compatible = "arm,mte-tag-storage";
> > > > > > > > > > > > reg = <0x08 0xf800 0x00 0x400>;
> > > > > > > > > > > > block-size = <0x1000>;
> > > > > > > > > > > > memory = <>;// Associated 
> > > > > > > > > > > > tagged memory node
> > > > > > > > > > > > };
> > > > > > > > > > >
> > > > > > > > > > > I skimmed thru the discussion some. If this memory range 
> > > > > > > > > > > is within
> > > > > > > > > > > main RAM, then it definitely belongs in /reserved-memory.
> > > > > > > > > >
> > > > > > > > > > Ok, will do that.
> > > > > > > > > >
> > > > > > > > > > If you don't mind, why do you say that it definitely 
> > > > > > > > > > belongs in
> > > > > > > > > > reserved-memory? I'm not trying to argue otherwise, I'm 
> > > > > > > > > > curious about the
> > > > > > > > > > motivation.
> > > > > > > > >
> > > > > > > > > Simply so that /memory nodes describe all possible memory and
> > > > > > > > > /reserved-memory is just adding restrictions. It's also 
> > > > > > > > > because
> > > > > > > > > /reserved-memory is what gets handled early, and we don't need
> > > > > > > > > multiple things to handle early.
> > > > > > > > >
> > > > > > > > > > Tag storage is not DMA and can live anywhere in memory.
> > > > > > > > >
> > > > > > 

Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-14 Thread Alexandru Elisei
Hi,

On Wed, Dec 13, 2023 at 02:30:42PM -0600, Rob Herring wrote:
> On Wed, Dec 13, 2023 at 11:44 AM Alexandru Elisei
>  wrote:
> >
> > On Wed, Dec 13, 2023 at 11:22:17AM -0600, Rob Herring wrote:
> > > On Wed, Dec 13, 2023 at 8:51 AM Alexandru Elisei
> > >  wrote:
> > > >
> > > > Hi,
> > > >
> > > > On Wed, Dec 13, 2023 at 08:06:44AM -0600, Rob Herring wrote:
> > > > > On Wed, Dec 13, 2023 at 7:05 AM Alexandru Elisei
> > > > >  wrote:
> > > > > >
> > > > > > Hi Rob,
> > > > > >
> > > > > > On Tue, Dec 12, 2023 at 12:44:06PM -0600, Rob Herring wrote:
> > > > > > > On Tue, Dec 12, 2023 at 10:38 AM Alexandru Elisei
> > > > > > >  wrote:
> > > > > > > >
> > > > > > > > Hi Rob,
> > > > > > > >
> > > > > > > > Thank you so much for the feedback, I'm not very familiar with 
> > > > > > > > device tree,
> > > > > > > > and any comments are very useful.
> > > > > > > >
> > > > > > > > On Mon, Dec 11, 2023 at 11:29:40AM -0600, Rob Herring wrote:
> > > > > > > > > On Sun, Nov 19, 2023 at 10:59 AM Alexandru Elisei
> > > > > > > > >  wrote:
> > > > > > > > > >
> > > > > > > > > > Allow the kernel to get the size and location of the MTE 
> > > > > > > > > > tag storage
> > > > > > > > > > regions from the DTB. This memory is marked as reserved for 
> > > > > > > > > > now.
> > > > > > > > > >
> > > > > > > > > > The DTB node for the tag storage region is defined as:
> > > > > > > > > >
> > > > > > > > > > tags0: tag-storage@8f800 {
> > > > > > > > > > compatible = "arm,mte-tag-storage";
> > > > > > > > > > reg = <0x08 0xf800 0x00 0x400>;
> > > > > > > > > > block-size = <0x1000>;
> > > > > > > > > > memory = <>;// Associated 
> > > > > > > > > > tagged memory node
> > > > > > > > > > };
> > > > > > > > >
> > > > > > > > > I skimmed thru the discussion some. If this memory range is 
> > > > > > > > > within
> > > > > > > > > main RAM, then it definitely belongs in /reserved-memory.
> > > > > > > >
> > > > > > > > Ok, will do that.
> > > > > > > >
> > > > > > > > If you don't mind, why do you say that it definitely belongs in
> > > > > > > > reserved-memory? I'm not trying to argue otherwise, I'm curious 
> > > > > > > > about the
> > > > > > > > motivation.
> > > > > > >
> > > > > > > Simply so that /memory nodes describe all possible memory and
> > > > > > > /reserved-memory is just adding restrictions. It's also because
> > > > > > > /reserved-memory is what gets handled early, and we don't need
> > > > > > > multiple things to handle early.
> > > > > > >
> > > > > > > > Tag storage is not DMA and can live anywhere in memory.
> > > > > > >
> > > > > > > Then why put it in DT at all? The only reason CMA is there is to 
> > > > > > > set
> > > > > > > the size. It's not even clear to me we need CMA in DT either. The
> > > > > > > reasoning long ago was the kernel didn't do a good job of moving 
> > > > > > > and
> > > > > > > reclaiming contiguous space, but that's supposed to be better now 
> > > > > > > (and
> > > > > > > most h/w figured out they need IOMMUs).
> > > > > > >
> > > > > > > But for tag storage you know the size as it is a function of the
> > > > > > > memory size, right? After all, you are validating the size is 
> > > > > > > correct.
> > > > > > > I guess there is 

Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-13 Thread Alexandru Elisei
On Wed, Dec 13, 2023 at 11:22:17AM -0600, Rob Herring wrote:
> On Wed, Dec 13, 2023 at 8:51 AM Alexandru Elisei
>  wrote:
> >
> > Hi,
> >
> > On Wed, Dec 13, 2023 at 08:06:44AM -0600, Rob Herring wrote:
> > > On Wed, Dec 13, 2023 at 7:05 AM Alexandru Elisei
> > >  wrote:
> > > >
> > > > Hi Rob,
> > > >
> > > > On Tue, Dec 12, 2023 at 12:44:06PM -0600, Rob Herring wrote:
> > > > > On Tue, Dec 12, 2023 at 10:38 AM Alexandru Elisei
> > > > >  wrote:
> > > > > >
> > > > > > Hi Rob,
> > > > > >
> > > > > > Thank you so much for the feedback, I'm not very familiar with 
> > > > > > device tree,
> > > > > > and any comments are very useful.
> > > > > >
> > > > > > On Mon, Dec 11, 2023 at 11:29:40AM -0600, Rob Herring wrote:
> > > > > > > On Sun, Nov 19, 2023 at 10:59 AM Alexandru Elisei
> > > > > > >  wrote:
> > > > > > > >
> > > > > > > > Allow the kernel to get the size and location of the MTE tag 
> > > > > > > > storage
> > > > > > > > regions from the DTB. This memory is marked as reserved for now.
> > > > > > > >
> > > > > > > > The DTB node for the tag storage region is defined as:
> > > > > > > >
> > > > > > > > tags0: tag-storage@8f800 {
> > > > > > > > compatible = "arm,mte-tag-storage";
> > > > > > > > reg = <0x08 0xf800 0x00 0x400>;
> > > > > > > > block-size = <0x1000>;
> > > > > > > > memory = <>;// Associated tagged 
> > > > > > > > memory node
> > > > > > > > };
> > > > > > >
> > > > > > > I skimmed thru the discussion some. If this memory range is within
> > > > > > > main RAM, then it definitely belongs in /reserved-memory.
> > > > > >
> > > > > > Ok, will do that.
> > > > > >
> > > > > > If you don't mind, why do you say that it definitely belongs in
> > > > > > reserved-memory? I'm not trying to argue otherwise, I'm curious 
> > > > > > about the
> > > > > > motivation.
> > > > >
> > > > > Simply so that /memory nodes describe all possible memory and
> > > > > /reserved-memory is just adding restrictions. It's also because
> > > > > /reserved-memory is what gets handled early, and we don't need
> > > > > multiple things to handle early.
> > > > >
> > > > > > Tag storage is not DMA and can live anywhere in memory.
> > > > >
> > > > > Then why put it in DT at all? The only reason CMA is there is to set
> > > > > the size. It's not even clear to me we need CMA in DT either. The
> > > > > reasoning long ago was the kernel didn't do a good job of moving and
> > > > > reclaiming contiguous space, but that's supposed to be better now (and
> > > > > most h/w figured out they need IOMMUs).
> > > > >
> > > > > But for tag storage you know the size as it is a function of the
> > > > > memory size, right? After all, you are validating the size is correct.
> > > > > I guess there is still the aspect of whether you want enable MTE or
> > > > > not which could be done in a variety of ways.
> > > >
> > > > Oh, sorry, my bad, I should have been clearer about this. I don't want 
> > > > to
> > > > put it in the DT as a "linux,cma" node. But I want it to be managed by 
> > > > CMA.
> > >
> > > Yes, I understand, but my point remains. Why do you need this in DT?
> > > If the location doesn't matter and you can calculate the size from the
> > > memory size, what else is there to add to the DT?
> >
> > I am afraid there has been a misunderstanding. What do you mean by
> > "location doesn't matter"?
> 
> You said:
> > Tag storage is not DMA and can live anywhere in memory.
> 
> Which I took as the kernel can figure out where to put it. But maybe
> you meant the h/w platform can hard code it to be anywhere in memory?
> If so, then yes, DT is needed.

Ah, I see, sorry for not being clear enough, you are correct: tag storage
is a hardware property, and software needs a mechanism (in this case, the
dt) to discover its properties.

> 
> > At the very least, Linux needs to know the address and size of a memory
> > region to use it. The series is about using the tag storage memory for
> > data. Tag storage cannot be described as a regular memory node because it
> > cannot be tagged (and normal memory can).
> 
> If the tag storage lives in the middle of memory, then it would be
> described in the memory node, but removed by being in reserved-memory
> node.

I don't follow. Would you mind going into more details?

> 
> > Then there's the matter of the tag storage block size (explained in this
> > commit message), and also knowing the memory range for which a tag storage
> > region stores the tags. This is explained in the cover letter.
> 
> Honestly, I just forgot about that part.

I totally understand, there are a lot of things to consider at the same
time.

Thanks,
Alex



Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-13 Thread Alexandru Elisei
Hi,

On Wed, Dec 13, 2023 at 08:06:44AM -0600, Rob Herring wrote:
> On Wed, Dec 13, 2023 at 7:05 AM Alexandru Elisei
>  wrote:
> >
> > Hi Rob,
> >
> > On Tue, Dec 12, 2023 at 12:44:06PM -0600, Rob Herring wrote:
> > > On Tue, Dec 12, 2023 at 10:38 AM Alexandru Elisei
> > >  wrote:
> > > >
> > > > Hi Rob,
> > > >
> > > > Thank you so much for the feedback, I'm not very familiar with device 
> > > > tree,
> > > > and any comments are very useful.
> > > >
> > > > On Mon, Dec 11, 2023 at 11:29:40AM -0600, Rob Herring wrote:
> > > > > On Sun, Nov 19, 2023 at 10:59 AM Alexandru Elisei
> > > > >  wrote:
> > > > > >
> > > > > > Allow the kernel to get the size and location of the MTE tag storage
> > > > > > regions from the DTB. This memory is marked as reserved for now.
> > > > > >
> > > > > > The DTB node for the tag storage region is defined as:
> > > > > >
> > > > > > tags0: tag-storage@8f800 {
> > > > > > compatible = "arm,mte-tag-storage";
> > > > > > reg = <0x08 0xf800 0x00 0x400>;
> > > > > > block-size = <0x1000>;
> > > > > > memory = <>;// Associated tagged memory 
> > > > > > node
> > > > > > };
> > > > >
> > > > > I skimmed thru the discussion some. If this memory range is within
> > > > > main RAM, then it definitely belongs in /reserved-memory.
> > > >
> > > > Ok, will do that.
> > > >
> > > > If you don't mind, why do you say that it definitely belongs in
> > > > reserved-memory? I'm not trying to argue otherwise, I'm curious about 
> > > > the
> > > > motivation.
> > >
> > > Simply so that /memory nodes describe all possible memory and
> > > /reserved-memory is just adding restrictions. It's also because
> > > /reserved-memory is what gets handled early, and we don't need
> > > multiple things to handle early.
> > >
> > > > Tag storage is not DMA and can live anywhere in memory.
> > >
> > > Then why put it in DT at all? The only reason CMA is there is to set
> > > the size. It's not even clear to me we need CMA in DT either. The
> > > reasoning long ago was the kernel didn't do a good job of moving and
> > > reclaiming contiguous space, but that's supposed to be better now (and
> > > most h/w figured out they need IOMMUs).
> > >
> > > But for tag storage you know the size as it is a function of the
> > > memory size, right? After all, you are validating the size is correct.
> > > I guess there is still the aspect of whether you want enable MTE or
> > > not which could be done in a variety of ways.
> >
> > Oh, sorry, my bad, I should have been clearer about this. I don't want to
> > put it in the DT as a "linux,cma" node. But I want it to be managed by CMA.
> 
> Yes, I understand, but my point remains. Why do you need this in DT?
> If the location doesn't matter and you can calculate the size from the
> memory size, what else is there to add to the DT?

I am afraid there has been a misunderstanding. What do you mean by
"location doesn't matter"?

At the very least, Linux needs to know the address and size of a memory
region to use it. The series is about using the tag storage memory for
data. Tag storage cannot be described as a regular memory node because it
cannot be tagged (and normal memory can).

Then there's the matter of the tag storage block size (explained in this
commit message), and also knowing the memory range for which a tag storage
region stores the tags. This is explained in the cover letter.

Is there something that you feel that is not clear enough? I am more than
happy to go into details.

Thanks,
Alex



Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-13 Thread Alexandru Elisei
Hi Rob,

On Tue, Dec 12, 2023 at 12:44:06PM -0600, Rob Herring wrote:
> On Tue, Dec 12, 2023 at 10:38 AM Alexandru Elisei
>  wrote:
> >
> > Hi Rob,
> >
> > Thank you so much for the feedback, I'm not very familiar with device tree,
> > and any comments are very useful.
> >
> > On Mon, Dec 11, 2023 at 11:29:40AM -0600, Rob Herring wrote:
> > > On Sun, Nov 19, 2023 at 10:59 AM Alexandru Elisei
> > >  wrote:
> > > >
> > > > Allow the kernel to get the size and location of the MTE tag storage
> > > > regions from the DTB. This memory is marked as reserved for now.
> > > >
> > > > The DTB node for the tag storage region is defined as:
> > > >
> > > > tags0: tag-storage@8f800 {
> > > > compatible = "arm,mte-tag-storage";
> > > > reg = <0x08 0xf800 0x00 0x400>;
> > > > block-size = <0x1000>;
> > > > memory = <>;// Associated tagged memory node
> > > > };
> > >
> > > I skimmed thru the discussion some. If this memory range is within
> > > main RAM, then it definitely belongs in /reserved-memory.
> >
> > Ok, will do that.
> >
> > If you don't mind, why do you say that it definitely belongs in
> > reserved-memory? I'm not trying to argue otherwise, I'm curious about the
> > motivation.
> 
> Simply so that /memory nodes describe all possible memory and
> /reserved-memory is just adding restrictions. It's also because
> /reserved-memory is what gets handled early, and we don't need
> multiple things to handle early.
> 
> > Tag storage is not DMA and can live anywhere in memory.
> 
> Then why put it in DT at all? The only reason CMA is there is to set
> the size. It's not even clear to me we need CMA in DT either. The
> reasoning long ago was the kernel didn't do a good job of moving and
> reclaiming contiguous space, but that's supposed to be better now (and
> most h/w figured out they need IOMMUs).
> 
> But for tag storage you know the size as it is a function of the
> memory size, right? After all, you are validating the size is correct.
> I guess there is still the aspect of whether you want enable MTE or
> not which could be done in a variety of ways.

Oh, sorry, my bad, I should have been clearer about this. I don't want to
put it in the DT as a "linux,cma" node. But I want it to be managed by CMA.

> 
> > In
> > arm64_memblock_init(), the kernel first removes the memory that it cannot
> > address from memblock. For example, because it has been compiled with
> > CONFIG_ARM64_VA_BITS_39=y. And then calls
> > early_init_fdt_scan_reserved_mem().
> >
> > What happens if reserved memory is above what the kernel can address?
> 
> I would hope the kernel handles it. That's the kernel's problem unless
> there's some h/w limitation to access some region. The DT can't have
> things dependent on the kernel config.

I would hope so too, that's why I was surprised when I put reserved memory
at 1TB in a 39 bit VA kernel and got a panic.

> 
> > From my testing, when the kernel is compiled with 39 bit VA, if I use
> > reserved memory to discover tag storage that lives above the virtual address
> > limit and then I try to use CMA to manage the tag storage memory, I get a
> > kernel panic:
> 
> Looks like we should handle that better...

I guess we don't need to tackle that problem right now. I don't know of
many systems in the wild that have memory above the 1TB address.

> 
> >> [0.00] Reserved memory: created CMA memory pool at 
> >> 0x0100, size 64 MiB
> > [0.00] OF: reserved mem: initialized node linux,cma, compatible id 
> > shared-dma-pool
> > [0.00] OF: reserved mem: 0x0100..0x010003ff 
> > (65536 KiB) map reusable linux,cma
> > [..]
> > [0.806945] Unable to handle kernel paging request at virtual address 
> > 0001fe00
> > [0.807277] Mem abort info:
> > [0.807277]   ESR = 0x9605
> > [0.807693]   EC = 0x25: DABT (current EL), IL = 32 bits
> > [0.808110]   SET = 0, FnV = 0
> > [0.808443]   EA = 0, S1PTW = 0
> > [0.808526]   FSC = 0x05: level 1 translation fault
> > [0.808943] Data abort info:
> > [0.808943]   ISV = 0, ISS = 0x0005, ISS2 = 0x
> > [0.809360]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> > [0.809776]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> > [0.810221] [0001fe00] user address but act

Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-12 Thread Alexandru Elisei
Hi Rob,

Thank you so much for the feedback, I'm not very familiar with device tree,
and any comments are very useful.

On Mon, Dec 11, 2023 at 11:29:40AM -0600, Rob Herring wrote:
> On Sun, Nov 19, 2023 at 10:59 AM Alexandru Elisei
>  wrote:
> >
> > Allow the kernel to get the size and location of the MTE tag storage
> > regions from the DTB. This memory is marked as reserved for now.
> >
> > The DTB node for the tag storage region is defined as:
> >
> > tags0: tag-storage@8f800 {
> > compatible = "arm,mte-tag-storage";
> > reg = <0x08 0xf800 0x00 0x400>;
> > block-size = <0x1000>;
> > memory = <>;// Associated tagged memory node
> > };
> 
> I skimmed thru the discussion some. If this memory range is within
> main RAM, then it definitely belongs in /reserved-memory.

Ok, will do that.

If you don't mind, why do you say that it definitely belongs in
reserved-memory? I'm not trying to argue otherwise, I'm curious about the
motivation.

Tag storage is not DMA and can live anywhere in memory. In
arm64_memblock_init(), the kernel first removes the memory that it cannot
address from memblock. For example, because it has been compiled with
CONFIG_ARM64_VA_BITS_39=y. And then calls
early_init_fdt_scan_reserved_mem().

What happens if reserved memory is above what the kernel can address?

From my testing, when the kernel is compiled with 39 bit VA, if I use
reserved memory to discover tag storage that lives above the virtual address
limit and then I try to use CMA to manage the tag storage memory, I get a
kernel panic:

[0.00] Reserved memory: created CMA memory pool at 0x0100, 
size 64 MiB
[0.00] OF: reserved mem: initialized node linux,cma, compatible id 
shared-dma-pool
[0.00] OF: reserved mem: 0x0100..0x010003ff (65536 
KiB) map reusable linux,cma
[..]
[0.806945] Unable to handle kernel paging request at virtual address 
0001fe00
[0.807277] Mem abort info:
[0.807277]   ESR = 0x9605
[0.807693]   EC = 0x25: DABT (current EL), IL = 32 bits
[0.808110]   SET = 0, FnV = 0
[0.808443]   EA = 0, S1PTW = 0
[0.808526]   FSC = 0x05: level 1 translation fault
[0.808943] Data abort info:
[0.808943]   ISV = 0, ISS = 0x0005, ISS2 = 0x
[0.809360]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[0.809776]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[0.810221] [0001fe00] user address but active_mm is swapper
[..]
[0.820887] Call trace:
[0.821027]  cma_init_reserved_areas+0xc4/0x378

> 
> You need a binding for this too.

By binding you mean having a YAML file in dt-schema [1] describing the tag
storage node, right?

[1] https://github.com/devicetree-org/dt-schema

> 
> > The tag storage region represents the largest contiguous memory region that
> > holds all the tags for the associated contiguous memory region which can be
> > tagged. For example, for a 32GB contiguous tagged memory the corresponding
> > tag storage region is 1GB of contiguous memory, not two adjacent 512M of
> > tag storage memory.
> >
> > "block-size" represents the minimum multiple of 4K of tag storage where all
> > the tags stored in the block correspond to a contiguous memory region. This
> > is needed for platforms where the memory controller interleaves tag writes
> > to memory. For example, if the memory controller interleaves tag writes for
> > 256KB of contiguous memory across 8K of tag storage (2-way interleave),
> > then the correct value for "block-size" is 0x2000. This value is a hardware
> > property, independent of the selected kernel page size.
> >
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  arch/arm64/Kconfig   |  12 ++
> >  arch/arm64/include/asm/mte_tag_storage.h |  15 ++
> >  arch/arm64/kernel/Makefile   |   1 +
> >  arch/arm64/kernel/mte_tag_storage.c  | 256 +++
> >  arch/arm64/kernel/setup.c|   7 +
> >  5 files changed, 291 insertions(+)
> >  create mode 100644 arch/arm64/include/asm/mte_tag_storage.h
> >  create mode 100644 arch/arm64/kernel/mte_tag_storage.c
> >
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 7b071a00425d..fe8276fdc7a8 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -2062,6 +2062,18 @@ config ARM64_MTE
> >
> >   Documentation/arch/arm64/memory-tagging-extension.rst.
> >
> > +if ARM64_MTE
> > +config ARM64_MTE_TAG_STORAGE
> > +   bool "Dynamic MTE tag storage management"
> &g

Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-11 Thread Alexandru Elisei
Hi,

On Fri, Dec 08, 2023 at 02:03:44PM +0900, Hyesoo Yu wrote:
> Hi, 
> 
> I'm sorry for the late response, I was on vacation.
> 
> On Sun, Dec 03, 2023 at 12:14:30PM +0000, Alexandru Elisei wrote:
> > Hi,
> > 
> > On Wed, Nov 29, 2023 at 05:44:24PM +0900, Hyesoo Yu wrote:
> > > Hello.
> > > 
> > > > On Sun, Nov 19, 2023 at 04:57:05PM +0000, Alexandru Elisei wrote:
> > > > Allow the kernel to get the size and location of the MTE tag storage
> > > > regions from the DTB. This memory is marked as reserved for now.
> > > > 
> > > > The DTB node for the tag storage region is defined as:
> > > > 
> > > > tags0: tag-storage@8f800 {
> > > > compatible = "arm,mte-tag-storage";
> > > > reg = <0x08 0xf800 0x00 0x400>;
> > > > block-size = <0x1000>;
> > > > memory = <>;// Associated tagged memory node
> > > > };
> > > >
> > > 
> > > How about using compatible = "shared-dma-pool" like below ?
> > > 
> > > _memory {
> > >   tags0: tag0@8f800 {
> > >   compatible = "arm,mte-tag-storage";
> > >   reg = <0x08 0xf800 0x00 0x400>;
> > >   };
> > > }
> > > 
> > > tag-storage {
> > > compatible = "arm,mte-tag-storage";
> > >   memory-region = <>;
> > > memory = <>;
> > >   block-size = <0x1000>;
> > > }
> > > 
> > > And then, the activation of CMA would be performed in the CMA code.
> > > We just can get the region information from memory-region and allocate it 
> > > directly
> > > like alloc_contig_range, take_page_off_buddy. It seems like we can remove 
> > > a lots of code.
> >
> 
> Sorry, that example was my mistake. Actually I wanted to write like this. 
> 
> _memory {
>   tags0: tag0@8f800 {
>   compatible = "shared-dma-pool";
>   reg = <0x08 0xf800 0x00 0x400>;
>   reusable;
>   };
> }
> 
> tag-storage {
> compatible = "arm,mte-tag-storage";
>   memory-region = <>;
> memory = <>;
>   block-size = <0x1000>;
> }

I prototyped your suggestion with this change to the device tree:

reserved-memory {
#address-cells = <0x02>;
#size-cells = <0x02>;
ranges;

tags0: tag-storage@8f800 {
compatible = "arm,mte-tag-storage";
reg = <0x08 0xf800 0x00 0x400>;
block-size = <0x1000>;
memory = <>;
reusable;
};
};

Would you mind explaining what we are gaining by using reserved mem?

Struct reserved_mem only has the base and size of the tag storage region,
and initialization for reserved mem happens before the DTB is unflattened.
When I prototyped using reserved mem, I still had to write the code to
parse the memory node address and size. This code was the same as the code
needed to parse the tag storage region address and size, so having that
information in struct reserved_mem does not reduce the size of the code by
a meaningful amount.

> 
> 
> > Played with reserved_mem a bit. I don't think that's the correct path
> > forward.
> > 
> > The location of the tag storage is a hardware property, independent of how
> > Linux is configured.
> > 
> > early_init_fdt_scan_reserved_mem() is called from arm64_memblock_init(),
> > **after** the kernel enforces an upper address for various reasons. One of
> > the reasons can be that it's been compiled with 39 bits VA.
> > 
> 
> I'm not sure about this part. What is the upper address enforced by the 
> kernel ?
> Where can I check the code ? Do you means that memblock_end_of_DRAM() ?  

I am referring to arch/arm64/mm/init.c:: arm64_memblock_init(). The
function initializes reserved mem (in early_init_fdt_scan_reserved_mem())
**after** removing memory from memblock that the kernel cannot address.

> 
> > After early_init_fdt_scan_reserved_mem() returns, the kernel sets the
> > maximum address, stored in the variable "high_memory".
> >
> > What can happen is that tag storage is present at an address above the
> > maximum addressable by the kernel, and the CMA co

Re: [PATCH RFC v2 15/27] arm64: mte: Check that tag storage blocks are in the same zone

2023-12-11 Thread Alexandru Elisei
Hi,

On Fri, Dec 08, 2023 at 02:27:39PM +0900, Hyesoo Yu wrote:
> Hi~
> 
> On Thu, Nov 30, 2023 at 12:00:11PM +0000, Alexandru Elisei wrote:
> > Hi,
> > 
> > On Wed, Nov 29, 2023 at 05:57:44PM +0900, Hyesoo Yu wrote:
> > > On Sun, Nov 19, 2023 at 04:57:09PM +0000, Alexandru Elisei wrote:
> > > > alloc_contig_range() requires that the requested pages are in the same
> > > > zone. Check that this is indeed the case before initializing the tag
> > > > storage blocks.
> > > > 
> > > > Signed-off-by: Alexandru Elisei 
> > > > ---
> > > >  arch/arm64/kernel/mte_tag_storage.c | 33 +
> > > >  1 file changed, 33 insertions(+)
> > > > 
> > > > diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> > > > b/arch/arm64/kernel/mte_tag_storage.c
> > > > index 8b9bedf7575d..fd63430d4dc0 100644
> > > > --- a/arch/arm64/kernel/mte_tag_storage.c
> > > > +++ b/arch/arm64/kernel/mte_tag_storage.c
> > > > @@ -265,6 +265,35 @@ void __init mte_tag_storage_init(void)
> > > > }
> > > >  }
> > > >  
> > > > +/* alloc_contig_range() requires all pages to be in the same zone. */
> > > > +static int __init mte_tag_storage_check_zone(void)
> > > > +{
> > > > +   struct range *tag_range;
> > > > +   struct zone *zone;
> > > > +   unsigned long pfn;
> > > > +   u32 block_size;
> > > > +   int i, j;
> > > > +
> > > > +   for (i = 0; i < num_tag_regions; i++) {
> > > > +   block_size = tag_regions[i].block_size;
> > > > +   if (block_size == 1)
> > > > +   continue;
> > > > +
> > > > +   tag_range = _regions[i].tag_range;
> > > > +   for (pfn = tag_range->start; pfn <= tag_range->end; pfn 
> > > > += block_size) {
> > > > +   zone = page_zone(pfn_to_page(pfn));
> > > 
> > > Hello.
> > > 
> > > Since the blocks within the tag_range must all be in the same zone, can 
> > > we move the "page_zone"
> > > out of the loop ?
> > `
> > Hmm.. why do you say that the pages in a tag_range must be in the same
> > zone? I am not very familiar with how the memory management code puts pages
> > into zones, but I would imagine that pages in a tag range straddling the
> > 4GB limit (so, let's say, from 3GB to 5GB) will end up in both ZONE_DMA and
> > ZONE_NORMAL.
> > 
> > Thanks,
> > Alex
> > 
> 
> Oh, I see that reserve_tag_storage only calls alloc_contig_range in units of 
> block_size,
> I thought it could be called for the entire range the page needed at once.
> (Maybe it could be a bit faster ? It doesn't seem like unnecessary drain and
> other operation are repeated.)

Yes, that might be useful to do. Worth keeping in mind is that:

- a number of block size pages at the start and end of the range might
  already be reserved for other tagged pages, so the actual range that is
  being reserved might end up being smaller than what we are expecting.

- the most common allocation order is smaller than or equal to
  PAGE_ALLOC_COSTLY_ORDER, which is 3, which means that the most common
  case is that reserve_tag_storage reserves only one tag storage block.

I will definitely keep this optimization in mind, but I would prefer to get
the series into a more stable shape before looking at performance
optimizations.

> 
> If we use the cma code when activating the tag storage, it will be error if 
> the
> entire area of tag region is not in the same zone, so there should be a 
> constraint
> that it must be in the same zone when defining the tag region on device tree.

I don't think that's the best approach, because the device tree describes
the hardware, which does not change, and this is a software limitation
(i.e., CMA doesn't work if a CMA region spans different zones), which might
get fixed in a future version of Linux.

In my opinion, the simplest solution would be to check that all tag storage
regions have been activated successfully by CMA before enabling tag
storage. Another alternative would be to split the tag storage region into
several CMA regions at a zone boundary, and add them as distinct CMA regions.

Thanks,
Alex

> 
> Thanks,
> Regards.
> 
> > > 
> > > Thanks,
> > > Regards.
> > > 
> > > > +   for (j = 1; j < block_size; j++) {
> > > > +   if (pa

Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-03 Thread Alexandru Elisei
Hi,

On Wed, Nov 29, 2023 at 05:44:24PM +0900, Hyesoo Yu wrote:
> Hello.
> 
> On Sun, Nov 19, 2023 at 04:57:05PM +0000, Alexandru Elisei wrote:
> > Allow the kernel to get the size and location of the MTE tag storage
> > regions from the DTB. This memory is marked as reserved for now.
> > 
> > The DTB node for the tag storage region is defined as:
> > 
> > tags0: tag-storage@8f800 {
> > compatible = "arm,mte-tag-storage";
> > reg = <0x08 0xf800 0x00 0x400>;
> > block-size = <0x1000>;
> > memory = <>;// Associated tagged memory node
> > };
> >
> 
> How about using compatible = "shared-dma-pool" like below ?
> 
> _memory {
>   tags0: tag0@8f800 {
>   compatible = "arm,mte-tag-storage";
>   reg = <0x08 0xf800 0x00 0x400>;
>   };
> }
> 
> tag-storage {
> compatible = "arm,mte-tag-storage";
>   memory-region = <>;
> memory = <>;
>   block-size = <0x1000>;
> }
> 
> And then, the activation of CMA would be performed in the CMA code.
> We just can get the region information from memory-region and allocate it 
> directly
> like alloc_contig_range, take_page_off_buddy. It seems like we can remove a 
> lots of code.

Played with reserved_mem a bit. I don't think that's the correct path
forward.

The location of the tag storage is a hardware property, independent of how
Linux is configured.

early_init_fdt_scan_reserved_mem() is called from arm64_memblock_init(),
**after** the kernel enforces an upper address for various reasons. One of
the reasons can be that it's been compiled with 39 bits VA.

After early_init_fdt_scan_reserved_mem() returns, the kernel sets the
maximum address, stored in the variable "high_memory".

What can happen is that tag storage is present at an address above the
maximum addressable by the kernel, and the CMA code will trigger an
unrecoverable page fault.

I was able to trigger this with the dts change:

diff --git a/arch/arm64/boot/dts/arm/fvp-base-revc.dts 
b/arch/arm64/boot/dts/arm/fvp-base-revc.dts
index 60472d65a355..201359d014e4 100644
--- a/arch/arm64/boot/dts/arm/fvp-base-revc.dts
+++ b/arch/arm64/boot/dts/arm/fvp-base-revc.dts
@@ -183,6 +183,13 @@ vram: vram@1800 {
reg = <0x 0x1800 0 0x0080>;
no-map;
};
+
+
+   linux,cma {
+   compatible = "shared-dma-pool";
+   reg = <0x100 0x0 0x00 0x400>;
+   reusable;
+   };
};

gic: interrupt-controller@2f00 {

And the error I got:

[0.00] Reserved memory: created CMA memory pool at 0x0100, 
size 64 MiB
[0.00] OF: reserved mem: initialized node linux,cma, compatible id 
shared-dma-pool
[0.00] OF: reserved mem: 0x0100..0x010003ff (65536 
KiB) map reusable linux,cma
[..]
[0.793193] WARNING: CPU: 0 PID: 1 at mm/cma.c:111 
cma_init_reserved_areas+0xa8/0x378
[..]
[0.806945] Unable to handle kernel paging request at virtual address 
0001fe00
[0.807277] Mem abort info:
[0.807277]   ESR = 0x9605
[0.807693]   EC = 0x25: DABT (current EL), IL = 32 bits
[0.808110]   SET = 0, FnV = 0
[0.808443]   EA = 0, S1PTW = 0
[0.808526]   FSC = 0x05: level 1 translation fault
[0.808943] Data abort info:
[0.808943]   ISV = 0, ISS = 0x0005, ISS2 = 0x
[0.809360]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[0.809776]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[0.810221] [0001fe00] user address but active_mm is swapper
[..]
[0.820887] Call trace:
[0.821027]  cma_init_reserved_areas+0xc4/0x378
[0.821443]  do_one_initcall+0x7c/0x1c0
[0.821860]  kernel_init_freeable+0x1bc/0x284
[0.822277]  kernel_init+0x24/0x1dc
[0.822693]  ret_from_fork+0x10/0x20
[0.823554] Code: 9127a29a cb813321 d37ae421 8b030020 (f8636822)
[0.823554] ---[ end trace  ]---
[0.824360] Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x000b
[0.824443] SMP: stopping secondary CPUs
[0.825193] ---[ end Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x000b ]---

Should reserved mem check if the reserved memory is actually addressable by
the kernel if it's not "no-map"? Should cma fail gracefully if
!pfn_valid(base_pfn)? Should early_init_fdt_scan_reserved_mem() be moved
because arm64_bootmem_init()? I don't have the answer to any of those. And
I got a kernel panic because the kernel cannot address that memory (39 bits
VA). I don't know what would

Re: [PATCH RFC v2 21/27] mm: arm64: Handle tag storage pages mapped before mprotect(PROT_MTE)

2023-11-30 Thread Alexandru Elisei
Hi Peter,

On Mon, Nov 27, 2023 at 09:39:17PM -0800, Peter Collingbourne wrote:
> Hi Alexandru,
> 
> On Sun, Nov 19, 2023 at 8:59 AM Alexandru Elisei
>  wrote:
> >
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  arch/arm64/include/asm/mte_tag_storage.h |  1 +
> >  arch/arm64/kernel/mte_tag_storage.c  | 15 +++
> >  arch/arm64/mm/fault.c| 55 
> >  include/linux/migrate.h  |  8 +++-
> >  include/linux/migrate_mode.h |  1 +
> >  mm/internal.h|  6 ---
> >  6 files changed, 78 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
> > b/arch/arm64/include/asm/mte_tag_storage.h
> > index b97406d369ce..6a8b19a6a758 100644
> > --- a/arch/arm64/include/asm/mte_tag_storage.h
> > +++ b/arch/arm64/include/asm/mte_tag_storage.h
> > @@ -33,6 +33,7 @@ int reserve_tag_storage(struct page *page, int order, 
> > gfp_t gfp);
> >  void free_tag_storage(struct page *page, int order);
> >
> >  bool page_tag_storage_reserved(struct page *page);
> > +bool page_is_tag_storage(struct page *page);
> >
> >  vm_fault_t handle_page_missing_tag_storage(struct vm_fault *vmf);
> >  vm_fault_t handle_huge_page_missing_tag_storage(struct vm_fault *vmf);
> > diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> > b/arch/arm64/kernel/mte_tag_storage.c
> > index a1cc239f7211..5096ce859136 100644
> > --- a/arch/arm64/kernel/mte_tag_storage.c
> > +++ b/arch/arm64/kernel/mte_tag_storage.c
> > @@ -500,6 +500,21 @@ bool page_tag_storage_reserved(struct page *page)
> > return test_bit(PG_tag_storage_reserved, >flags);
> >  }
> >
> > +bool page_is_tag_storage(struct page *page)
> > +{
> > +   unsigned long pfn = page_to_pfn(page);
> > +   struct range *tag_range;
> > +   int i;
> > +
> > +   for (i = 0; i < num_tag_regions; i++) {
> > +   tag_range = _regions[i].tag_range;
> > +   if (tag_range->start <= pfn && pfn <= tag_range->end)
> > +   return true;
> > +   }
> > +
> > +   return false;
> > +}
> > +
> >  int reserve_tag_storage(struct page *page, int order, gfp_t gfp)
> >  {
> > unsigned long start_block, end_block;
> > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> > index 6730a0812a24..964c5ae161a3 100644
> > --- a/arch/arm64/mm/fault.c
> > +++ b/arch/arm64/mm/fault.c
> > @@ -12,6 +12,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -956,6 +957,50 @@ void tag_clear_highpage(struct page *page)
> >  }
> >
> >  #ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> > +
> > +#define MR_TAGGED_TAG_STORAGE  MR_ARCH_1
> > +
> > +extern bool isolate_lru_page(struct page *page);
> > +extern void putback_movable_pages(struct list_head *l);
> 
> Could we move these declarations to a non-mm-internal header and
> #include it instead of manually declaring them here?

Yes, that's better than this hackish way of doing it.

> 
> > +
> > +/* Returns with the page reference dropped. */
> > +static void migrate_tag_storage_page(struct page *page)
> > +{
> > +   struct migration_target_control mtc = {
> > +   .nid = NUMA_NO_NODE,
> > +   .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_TAGGED,
> > +   };
> > +   unsigned long i, nr_pages = compound_nr(page);
> > +   LIST_HEAD(pagelist);
> > +   int ret, tries;
> > +
> > +   lru_cache_disable();
> > +
> > +   for (i = 0; i < nr_pages; i++) {
> > +   if (!isolate_lru_page(page + i)) {
> > +   ret = -EAGAIN;
> > +   goto out;
> > +   }
> > +   /* Isolate just grabbed another reference, drop ours. */
> > +   put_page(page + i);
> > +   list_add_tail(&(page + i)->lru, );
> > +   }
> > +
> > +   tries = 5;
> > +   while (tries--) {
> > +   ret = migrate_pages(, alloc_migration_target, 
> > NULL, (unsigned long),
> > +   MIGRATE_SYNC, MR_TAGGED_TAG_STORAGE, 
> > NULL);
> > +   if (ret == 0 || ret != -EBUSY)
> 
> This could be simplified to:
> 
> if (ret != -EBUSY)

Indeed! I can do the same thing in reserve_tag_storage(), in the loop where I
call alloc_contig_range().

Thanks,
Alex



Re: [PATCH RFC v2 19/27] mm: mprotect: Introduce PAGE_FAULT_ON_ACCESS for mprotect(PROT_MTE)

2023-11-30 Thread Alexandru Elisei
Hi,

On Thu, Nov 30, 2023 at 01:49:34PM +0100, David Hildenbrand wrote:
> > > > +
> > > > +out_retry:
> > > > +   put_page(page);
> > > > +   if (vmf->flags & FAULT_FLAG_VMA_LOCK)
> > > > +   vma_end_read(vma);
> > > > +   if (fault_flag_allow_retry_first(vmf->flags)) {
> > > > +   err = VM_FAULT_RETRY;
> > > > +   } else {
> > > > +   /* Replay the fault. */
> > > > +   err = 0;
> > > 
> > > Hello!
> > > 
> > > Unfortunately, if the page continues to be pinned, it seems like fault 
> > > will continue to occur.
> > > I guess it makes system stability issue. (but I'm not familiar with that, 
> > > so please let me know if I'm mistaken!)
> > > 
> > > How about migrating the page when migration problem repeats.
> > 
> > Yes, I had the same thought in the previous iteration of the series, the
> > page was migrated out of the VMA if tag storage couldn't be reserved.
> > 
> > Only short term pins are allowed on MIGRATE_CMA pages, so I expect that the
> > pin will be released before the fault is replayed. Because of this, and
> > because it makes the code simpler, I chose not to migrate the page if tag
> > storage couldn't be reserved.
> 
> There are still some cases that are theoretically problematic: vmsplice()
> can pin pages forever and doesn't use FOLL_LONGTERM yet.
> 
> All these things also affect other users that rely on movability (e.g., CMA,
> memory hotunplug).

I wasn't aware of that, thank you for the information. Then to ensure that the
process doesn't hang by replaying the fault indefinitely, I'll migrate the page if
tag storage cannot be reserved. Looking over the code again, I think I can reuse
the same function that migrates tag storage pages out of the MTE VMA (added in
patch #21), so no major changes needed.

Thanks,
Alex

> 
> -- 
> Cheers,
> 
> David / dhildenb
> 
> 



Re: [PATCH RFC v2 19/27] mm: mprotect: Introduce PAGE_FAULT_ON_ACCESS for mprotect(PROT_MTE)

2023-11-30 Thread Alexandru Elisei
Hi,

On Wed, Nov 29, 2023 at 06:27:25PM +0900, Hyesoo Yu wrote:
> On Sun, Nov 19, 2023 at 04:57:13PM +0000, Alexandru Elisei wrote:
> > To enable tagging on a memory range, userspace can use mprotect() with the
> > PROT_MTE access flag. Pages already mapped in the VMA don't have the
> > associated tag storage block reserved, so mark the PTEs as
> > PAGE_FAULT_ON_ACCESS to trigger a fault next time they are accessed, and
> > reserve the tag storage on the fault path.
> > 
> > This has several benefits over reserving the tag storage as part of the
> > mprotect() call handling:
> > 
> > - Tag storage is reserved only for those pages in the VMA that are
> >   accessed, instead of for all the pages already mapped in the VMA.
> > - Reduces the latency of the mprotect() call.
> > - Eliminates races with page migration.
> > 
> > But all of this is at the expense of an extra page fault per page until the
> > pages being accessed all have their corresponding tag storage reserved.
> > 
> > For arm64, the PAGE_FAULT_ON_ACCESS protection is created by defining a new
> > page table entry software bit, PTE_TAG_STORAGE_NONE. Linux doesn't set any
> > of the PBHA bits in entries from the last level of the translation table
> > and it doesn't use the TCR_ELx.HWUxx bits; also, the first PBHA bit, bit
> > 59, is already being used as a software bit for PMD_PRESENT_INVALID.
> > 
> > This is only implemented for PTE mappings; PMD mappings will follow.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  arch/arm64/Kconfig   |   1 +
> >  arch/arm64/include/asm/mte.h |   4 +-
> >  arch/arm64/include/asm/mte_tag_storage.h |   2 +
> >  arch/arm64/include/asm/pgtable-prot.h|   2 +
> >  arch/arm64/include/asm/pgtable.h |  40 ++---
> >  arch/arm64/kernel/mte.c  |  12 ++-
> >  arch/arm64/mm/fault.c| 101 +++
> >  include/linux/pgtable.h  |  17 
> >  mm/Kconfig   |   3 +
> >  mm/memory.c  |   3 +
> >  10 files changed, 170 insertions(+), 15 deletions(-)
> > 
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index efa5b7958169..3b9c435eaafb 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -2066,6 +2066,7 @@ if ARM64_MTE
> >  config ARM64_MTE_TAG_STORAGE
> > bool "Dynamic MTE tag storage management"
> > depends on ARCH_KEEP_MEMBLOCK
> > +   select ARCH_HAS_FAULT_ON_ACCESS
> > select CONFIG_CMA
> > help
> >   Adds support for dynamic management of the memory used by the hardware
> > diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
> > index 6457b7899207..70dc2e409070 100644
> > --- a/arch/arm64/include/asm/mte.h
> > +++ b/arch/arm64/include/asm/mte.h
> > @@ -107,7 +107,7 @@ static inline bool try_page_mte_tagging(struct page 
> > *page)
> >  }
> >  
> >  void mte_zero_clear_page_tags(void *addr);
> > -void mte_sync_tags(pte_t pte, unsigned int nr_pages);
> > +void mte_sync_tags(pte_t *pteval, unsigned int nr_pages);
> >  void mte_copy_page_tags(void *kto, const void *kfrom);
> >  void mte_thread_init_user(void);
> >  void mte_thread_switch(struct task_struct *next);
> > @@ -139,7 +139,7 @@ static inline bool try_page_mte_tagging(struct page 
> > *page)
> >  static inline void mte_zero_clear_page_tags(void *addr)
> >  {
> >  }
> > -static inline void mte_sync_tags(pte_t pte, unsigned int nr_pages)
> > +static inline void mte_sync_tags(pte_t *pteval, unsigned int nr_pages)
> >  {
> >  }
> >  static inline void mte_copy_page_tags(void *kto, const void *kfrom)
> > diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
> > b/arch/arm64/include/asm/mte_tag_storage.h
> > index 6e5d28e607bb..c70ced60a0cd 100644
> > --- a/arch/arm64/include/asm/mte_tag_storage.h
> > +++ b/arch/arm64/include/asm/mte_tag_storage.h
> > @@ -33,6 +33,8 @@ int reserve_tag_storage(struct page *page, int order, 
> > gfp_t gfp);
> >  void free_tag_storage(struct page *page, int order);
> >  
> >  bool page_tag_storage_reserved(struct page *page);
> > +
> > +vm_fault_t handle_page_missing_tag_storage(struct vm_fault *vmf);
> >  #else
> >  static inline bool tag_storage_enabled(void)
> >  {
> > diff --git a/arch/arm64/include/asm/pgtable-prot.h 
> > b/arch/arm64/include/asm/pgtable-prot.h
> > index e9624f6326dd..85ebb3e352ad 100644
>

Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-11-30 Thread Alexandru Elisei
Hi,

On Wed, Nov 29, 2023 at 05:44:24PM +0900, Hyesoo Yu wrote:
> Hello.
> 
> On Sun, Nov 19, 2023 at 04:57:05PM +0000, Alexandru Elisei wrote:
> > Allow the kernel to get the size and location of the MTE tag storage
> > regions from the DTB. This memory is marked as reserved for now.
> > 
> > The DTB node for the tag storage region is defined as:
> > 
> > tags0: tag-storage@8f800 {
> > compatible = "arm,mte-tag-storage";
> > reg = <0x08 0xf800 0x00 0x400>;
> > block-size = <0x1000>;
> > memory = <>;// Associated tagged memory node
> > };
> >
> 
> How about using compatible = "shared-dma-pool" like below ?
> 
> _memory {
>   tags0: tag0@8f800 {
>   compatible = "arm,mte-tag-storage";
>   reg = <0x08 0xf800 0x00 0x400>;
>   };
> }
> 
> tag-storage {
> compatible = "arm,mte-tag-storage";
>   memory-region = <>;
> memory = <>;
>   block-size = <0x1000>;
> }

I'm sorry, but I don't follow where compatible = "shared-dma-pool" fits
with the examples.

> 
> And then, the activation of CMA would be performed in the CMA code.
> We just can get the region information from memory-region and allocate it 
> directly
> like alloc_contig_range, take_page_off_buddy. It seems like we can remove a 
> lots of code.

For the next iteration I am planning to integrate the code more tightly
with CMA, so any suggestions to that effect are very welcome :)

> 
> > The tag storage region represents the largest contiguous memory region that
> > holds all the tags for the associated contiguous memory region which can be
> > tagged. For example, for a 32GB contiguous tagged memory the corresponding
> > tag storage region is 1GB of contiguous memory, not two adjacent 512M of
> > tag storage memory.
> > 
> > "block-size" represents the minimum multiple of 4K of tag storage where all
> > the tags stored in the block correspond to a contiguous memory region. This
> > is needed for platforms where the memory controller interleaves tag writes
> > to memory. For example, if the memory controller interleaves tag writes for
> > 256KB of contiguous memory across 8K of tag storage (2-way interleave),
> > then the correct value for "block-size" is 0x2000. This value is a hardware
> > property, independent of the selected kernel page size.
> >
> 
> Is it considered for kernel page size like 16K page, 64K page ? The comment 
> says
> it should be a multiple of 4K, but it should be a multiple of the "page size" 
> more accurately.
> Please let me know if there's anything I misunderstood. :-)

The block size in the DTB is a hardware property, it's independent of the
kernel page size, which is a compile time option.

The function get_block_size_pages(), which computes the tag storage block
size as the kernel will use it, takes into account the fact that the
hardware block size is not necessarily a multiple of the kernel page size,
and computes the least common multiple by doing:

(kernel page size in bytes x DTB block size in bytes) / greatest common divisor

As for why the hardware block size is a multiple of 4k, that was chosen
because it will be part of the architecture update. Since the minimum
hardware page size is 4K, it doesn't make much sense to have the DTB
block-size smaller than that.

Hope that makes sense!

Thanks,
Alex

> 
> 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  arch/arm64/Kconfig   |  12 ++
> >  arch/arm64/include/asm/mte_tag_storage.h |  15 ++
> >  arch/arm64/kernel/Makefile   |   1 +
> >  arch/arm64/kernel/mte_tag_storage.c  | 256 +++
> >  arch/arm64/kernel/setup.c|   7 +
> >  5 files changed, 291 insertions(+)
> >  create mode 100644 arch/arm64/include/asm/mte_tag_storage.h
> >  create mode 100644 arch/arm64/kernel/mte_tag_storage.c
> > 
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 7b071a00425d..fe8276fdc7a8 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -2062,6 +2062,18 @@ config ARM64_MTE
> >  
> >   Documentation/arch/arm64/memory-tagging-extension.rst.
> >  
> > +if ARM64_MTE
> > +config ARM64_MTE_TAG_STORAGE
> > +   bool "Dynamic MTE tag storage management"
> > +   help
> > + Adds support for dynamic management of the memory used by the hardware
> > + for storing MTE tags. This memory, u

Re: [PATCH RFC v2 18/27] arm64: mte: Reserve tag block for the zero page

2023-11-29 Thread Alexandru Elisei
Hi,

On Wed, Nov 29, 2023 at 02:13:50PM +0100, David Hildenbrand wrote:
> On 29.11.23 12:30, Alexandru Elisei wrote:
> > On Tue, Nov 28, 2023 at 06:06:54PM +0100, David Hildenbrand wrote:
> > > On 19.11.23 17:57, Alexandru Elisei wrote:
> > > > On arm64, the zero page receives special treatment by having the tagged
> > > > flag set on MTE initialization, not when the page is mapped in a process
> > > > address space. Reserve the corresponding tag block when tag storage
> > > > management is being activated.
> > > 
> > > Out of curiosity: why does the shared zeropage require tagged storage? 
> > > What
> > > about the huge zeropage?
> > 
> > There are two different tags that are used for tag checking: the logical
> > tag, the tag embedded in bits 59:56 of an address, and the physical tag
> > corresponding to the address. This tag is stored in a separate memory
> > location, called tag storage. When an access is performed, hardware
> > compares the logical tag (from the address) with the physical tag (from the
> > tag storage). If they match, the access is permitted.
> 
> Ack, matches my understanding.
> 
> > 
> > The physical tag is set with special instructions.
> > 
> > Userspace pointers have bits 59:56 zero. If the pointer is in a VMA with
> > MTE enabled, then for userspace to be able to access this address, the
> > > physical tag must also be 0b0000.
> > 
> > To make it easier on userspace, when a page is first mapped as tagged, its
> > tags are cleared by the kernel; this way, userspace can access the address
> > immediately, without clearing the physical tags beforehand. Another reason
> > for clearing the physical tags when a page is mapped as tagged would be to
> > avoid leaking uninitialized tags to userspace.
> 
> Make sense. Zero it just like we zero page content.
> 
> > 
> > The zero page is special, because the physical tags are not zeroed every
> > time the page is mapped in a process; instead, the zero page is marked as
> > tagged (by setting a page flag) and the physical tags are zeroed only once,
> > when MTE is enabled at boot.
> 
> Makes sense.
> 
> > 
> > All of this means that when tag storage is enabled, which happens after MTE
> > is enabled, the tag storage corresponding to the zero page is already in
> > > use and must be reserved, and it can never be used for data allocations.
> > 
> > I hope all of the above makes sense. I can also put it in the commit
> > message :)
> 
> Yes, makes sense!
> 
> > 
> > As for the zero huge page, the MTE code in the kernel treats it like a
> > regular page, and it zeroes the tags when it is mapped as tagged in a
> > process. I agree that this might not be the best solution from a
> > performance perspective, but it has worked so far.
> 
> What if user space were to change the tag of that shared resource?
> 
> Having a tag != 0 doesn't make sense for such a shared resource, so I
> suspect modifying the tag is like a write event: trigger write-fault -> COW.

Yes, modifying the tag is a write event.

> 
> > 
> > With tag storage management enabled, set_pte_at()->mte_sync_tags() will
> > discover that the huge zero page doesn't have tag storage reserved, the
> > table entry will be mapped as invalid to use the page fault-on-access
> > mechanism that I introduce later in the series [1] to reserve tag storage,
> 
> I assume (without looking at the code) that you took proper care of possible
> races.
> 
> Thanks for going into detail!

No problem.

Alex

> 
> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 16/27] arm64: mte: Manage tag storage on page allocation

2023-11-29 Thread Alexandru Elisei
Hi,

On Wed, Nov 29, 2023 at 06:10:40PM +0900, Hyesoo Yu wrote:
> On Sun, Nov 19, 2023 at 04:57:10PM +0000, Alexandru Elisei wrote:
> > [..]
> > +static int order_to_num_blocks(int order)
> > +{
> > +   return max((1 << order) / 32, 1);
> > +}
> > [..]
> > +int reserve_tag_storage(struct page *page, int order, gfp_t gfp)
> > +{
> > +   unsigned long start_block, end_block;
> > +   struct tag_region *region;
> > +   unsigned long block;
> > +   unsigned long flags;
> > +   unsigned int tries;
> > +   int ret = 0;
> > +
> > +   VM_WARN_ON_ONCE(!preemptible());
> > +
> > +   if (page_tag_storage_reserved(page))
> > +   return 0;
> > +
> > +   /*
> > +* __alloc_contig_migrate_range() ignores gfp when allocating the
> > +* destination page for migration. Regardless, massage gfp flags and
> > +* remove __GFP_TAGGED to avoid recursion in case gfp stops being
> > +* ignored.
> > +*/
> > +   gfp &= ~__GFP_TAGGED;
> > +   if (!(gfp & __GFP_NORETRY))
> > +   gfp |= __GFP_RETRY_MAYFAIL;
> > +
> > +   ret = tag_storage_find_block(page, &start_block, &region);
> > +   if (WARN_ONCE(ret, "Missing tag storage block for pfn 0x%lx", 
> > page_to_pfn(page)))
> > +   return 0;
> > +   end_block = start_block + order_to_num_blocks(order) * 
> > region->block_size;
> > +
> 
> Hello.
> 
> If the page size is 4K,  block size is 2 (block size bytes 8K), and order is 
> 6,
> then we need 2 pages for the tag. However according to the equation, 
> order_to_num_blocks
> is 2 and block_size is also 2, so end block will be incremented by 4.
> 
> However, we actually only need 8K of tag storage for 256K, right?
> Could you explain order_to_num_blocks * region->block_size in more detail?

I think you are correct, thank you for pointing it out. The formula should
probably be something like:

static int order_to_num_blocks(int order, u32 block_size)
{
int num_tag_pages = max((1 << order) / 32, 1);

return DIV_ROUND_UP(num_tag_pages, block_size);
}

and that will make end_block = start_block + 2 in your scenario.

Does that look correct to you?

Thanks,
Alex

> 
> Thanks,
> Regards.
> 
> > +   mutex_lock(&tag_blocks_lock);
> > +
> > +   /* Check again, this time with the lock held. */
> > +   if (page_tag_storage_reserved(page))
> > +   goto out_unlock;
> > +
> > +   /* Make sure existing entries are not freed from under our feet. */
> > +   xa_lock_irqsave(&tag_blocks_reserved, flags);
> > +   for (block = start_block; block < end_block; block += 
> > region->block_size) {
> > +   if (tag_storage_block_is_reserved(block))
> > +   block_ref_add(block, region, order);
> > +   }
> > +   xa_unlock_irqrestore(&tag_blocks_reserved, flags);
> > +
> > +   for (block = start_block; block < end_block; block += 
> > region->block_size) {
> > +   /* Refcount incremented above. */
> > +   if (tag_storage_block_is_reserved(block))
> > +   continue;
> > +
> > +   tries = 3;
> > +   while (tries--) {
> > +   ret = alloc_contig_range(block, block + 
> > region->block_size, MIGRATE_CMA, gfp);
> > +   if (ret == 0 || ret != -EBUSY)
> > +   break;
> > +   }
> > +
> > +   if (ret)
> > +   goto out_error;
> > +
> > +   ret = tag_storage_reserve_block(block, region, order);
> > +   if (ret) {
> > +   free_contig_range(block, region->block_size);
> > +   goto out_error;
> > +   }
> > +
> > +   count_vm_events(CMA_ALLOC_SUCCESS, region->block_size);
> > +   }
> > +
> > +   page_set_tag_storage_reserved(page, order);
> > +out_unlock:
> > +   mutex_unlock(&tag_blocks_lock);
> > +
> > +   return 0;
> > +
> > +out_error:
> > +   xa_lock_irqsave(&tag_blocks_reserved, flags);
> > +   for (block = start_block; block < end_block; block += 
> > region->block_size) {
> > +   if (tag_storage_block_is_reserved(block) &&
> > +   block_ref_sub_return(block, region, order) == 1) {
> > +   __xa_erase(&tag_blocks_reserved, block);
> > +   free_contig_range(block, region->block_size);
> > +   }
> > +   }
> > +   xa_unlock_irqrestore(&tag_blocks_reserved, flags);
> > +
> > +   mutex_unlock(_b

Re: [PATCH RFC v2 20/27] mm: hugepage: Handle huge page fault on access

2023-11-29 Thread Alexandru Elisei
Hi,

On Tue, Nov 28, 2023 at 06:56:34PM +0100, David Hildenbrand wrote:
> On 19.11.23 17:57, Alexandru Elisei wrote:
> > Handle PAGE_FAULT_ON_ACCESS faults for huge pages in a similar way to
> > regular pages.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> 
> Same comments :)

Yes, will have a look at this fault handling path too :)

Thanks,
Alex

> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 19/27] mm: mprotect: Introduce PAGE_FAULT_ON_ACCESS for mprotect(PROT_MTE)

2023-11-29 Thread Alexandru Elisei
Hi,

On Tue, Nov 28, 2023 at 06:55:18PM +0100, David Hildenbrand wrote:
> On 19.11.23 17:57, Alexandru Elisei wrote:
> > To enable tagging on a memory range, userspace can use mprotect() with the
> > PROT_MTE access flag. Pages already mapped in the VMA don't have the
> > associated tag storage block reserved, so mark the PTEs as
> > PAGE_FAULT_ON_ACCESS to trigger a fault next time they are accessed, and
> > reserve the tag storage on the fault path.
> 
> That sounds a lot like fake PROT_NONE. Would there be a way to unify that

Yes, arm64 basically defines PAGE_FAULT_ON_ACCESS as PAGE_NONE |
PTE_TAG_STORAGE_NONE.

> handling and simply reuse pte_protnone()? For example, could we special case
> on VMA flags?
> 
> Like, don't do NUMA hinting in these special VMAs. Then, have something
> like:
> 
> if (pte_protnone(vmf->orig_pte))
>   return handle_pte_protnone(vmf);
> 
> In there, special case on the VMA flags.

Your suggestion from the follow-up reply that an arch should know if it needs to
do something was spot on, arm64 can use the software bit in the translation
table entry for that.

So what you are proposing is this:

* Rename do_numa_page->handle_pte_protnone
* At some point in the do_numa_page (now renamed to handle_pte_protnone) flow,
  decide if pte_protnone() has been set for an arch specific reason or because
  of automatic NUMA balancing.
* if pte_protnone() has been set by an architecture, then let the architecture
  handle the fault.

If I understood you correctly, that's a good idea, and should be easy to
implement.

> 
> I *suspect* that handle_page_missing_tag_storage() stole (sorry :P) some

Indeed, most of the code is taken as-is from do_numa_page().

> code from the prot_none handling path. At least the recovery path and
> writability handling looks like it better be located shared in
> handle_pte_protnone() as well.

Yes, I agree.

Thanks,
Alex

> 
> That might take some magic out of this patch.
> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 18/27] arm64: mte: Reserve tag block for the zero page

2023-11-29 Thread Alexandru Elisei
On Tue, Nov 28, 2023 at 06:06:54PM +0100, David Hildenbrand wrote:
> On 19.11.23 17:57, Alexandru Elisei wrote:
> > On arm64, the zero page receives special treatment by having the tagged
> > flag set on MTE initialization, not when the page is mapped in a process
> > address space. Reserve the corresponding tag block when tag storage
> > management is being activated.
> 
> Out of curiosity: why does the shared zeropage require tagged storage? What
> about the huge zeropage?

There are two different tags that are used for tag checking: the logical
tag, the tag embedded in bits 59:56 of an address, and the physical tag
corresponding to the address. This tag is stored in a separate memory
location, called tag storage. When an access is performed, hardware
compares the logical tag (from the address) with the physical tag (from the
tag storage). If they match, the access is permitted.

The physical tag is set with special instructions.

Userspace pointers have bits 59:56 zero. If the pointer is in a VMA with
MTE enabled, then for userspace to be able to access this address, the
physical tag must also be 0b0000.

To make it easier on userspace, when a page is first mapped as tagged, its
tags are cleared by the kernel; this way, userspace can access the address
immediately, without clearing the physical tags beforehand. Another reason
for clearing the physical tags when a page is mapped as tagged would be to
avoid leaking uninitialized tags to userspace.

The zero page is special, because the physical tags are not zeroed every
time the page is mapped in a process; instead, the zero page is marked as
tagged (by setting a page flag) and the physical tags are zeroed only once,
when MTE is enabled at boot.

All of this means that when tag storage is enabled, which happens after MTE
is enabled, the tag storage corresponding to the zero page is already in
use and must be reserved, and it can never be used for data allocations.

I hope all of the above makes sense. I can also put it in the commit
message :)

As for the zero huge page, the MTE code in the kernel treats it like a
regular page, and it zeroes the tags when it is mapped as tagged in a
process. I agree that this might not be the best solution from a
performance perspective, but it has worked so far.

With tag storage management enabled, set_pte_at()->mte_sync_tags() will
discover that the huge zero page doesn't have tag storage reserved, the
table entry will be mapped as invalid to use the page fault-on-access
mechanism that I introduce later in the series [1] to reserve tag storage,
and after that set_pte_at() will zero the physical tags.

[1] https://lore.kernel.org/all/20231119165721.9849-20-alexandru.eli...@arm.com/

Thanks,
Alex

> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 13/27] arm64: mte: Make tag storage depend on ARCH_KEEP_MEMBLOCK

2023-11-29 Thread Alexandru Elisei
Hi,

On Tue, Nov 28, 2023 at 06:05:20PM +0100, David Hildenbrand wrote:
> On 27.11.23 16:04, Alexandru Elisei wrote:
> > Hi,
> > 
> > On Fri, Nov 24, 2023 at 08:51:38PM +0100, David Hildenbrand wrote:
> > > On 19.11.23 17:57, Alexandru Elisei wrote:
> > > > Tag storage memory requires that the tag storage pages used for data are
> > > > always migratable when they need to be repurposed to store tags.
> > > > 
> > > > If ARCH_KEEP_MEMBLOCK is enabled, kexec will scan all non-reserved
> > > > memblocks to find a suitable location for copying the kernel image. The
> > > > kernel image, once loaded, cannot be moved to another location in 
> > > > physical
> > > > memory. The initialization code for the tag storage reserves the 
> > > > memblocks
> > > > for the tag storage pages, which means kexec will not use them, and the 
> > > > tag
> > > > storage pages can be migrated at any time, which is the desired 
> > > > behaviour.
> > > > 
> > > > However, if ARCH_KEEP_MEMBLOCK is not selected, kexec will not skip a
> > > > region unless the memory resource has the 
> > > > IORESOURCE_SYSRAM_DRIVER_MANAGED
> > > > flag, which isn't currently set by the tag storage initialization code.
> > > > 
> > > > Make ARM64_MTE_TAG_STORAGE depend on ARCH_KEEP_MEMBLOCK to make it 
> > > > explicit
> > > > that the Kconfig option is required for it to work correctly.
> > > > 
> > > > Signed-off-by: Alexandru Elisei 
> > > > ---
> > > >arch/arm64/Kconfig | 1 +
> > > >1 file changed, 1 insertion(+)
> > > > 
> > > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > > index 047487046e8f..efa5b7958169 100644
> > > > --- a/arch/arm64/Kconfig
> > > > +++ b/arch/arm64/Kconfig
> > > > @@ -2065,6 +2065,7 @@ config ARM64_MTE
> > > >if ARM64_MTE
> > > >config ARM64_MTE_TAG_STORAGE
> > > > bool "Dynamic MTE tag storage management"
> > > > +   depends on ARCH_KEEP_MEMBLOCK
> > > > select CONFIG_CMA
> > > > help
> > > >   Adds support for dynamic management of the memory used by the 
> > > > hardware
> > > 
> > > Doesn't arm64 select that unconditionally? Why is this required then?
> > 
> > I've added this patch to make the dependency explicit. If, in the future, 
> > arm64
> > stops selecting ARCH_KEEP_MEMBLOCK, I think it would be very easy to miss 
> > the
> > fact that tag storage depends on it. So this patch is not required per se, 
> > it's
> > there to document the dependency.
> 
> I see. Could you add some static_assert / BUILD_BUG_ON instead?

I can do that, sure.

Thanks,
Alex

> 
> I suspect there are plenty other (undocumented) reasons why
> ARCH_KEEP_MEMBLOCK has to be enabled for now, and none sets
> ARCH_KEEP_MEMBLOCK, I suspect.
> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 12/27] arm64: mte: Add tag storage pages to the MIGRATE_CMA migratetype

2023-11-29 Thread Alexandru Elisei
Hi,

On Tue, Nov 28, 2023 at 06:03:52PM +0100, David Hildenbrand wrote:
> On 27.11.23 16:01, Alexandru Elisei wrote:
> > Hi David,
> > 
> > On Fri, Nov 24, 2023 at 08:40:55PM +0100, David Hildenbrand wrote:
> > > On 19.11.23 17:57, Alexandru Elisei wrote:
> > > > Add the MTE tag storage pages to the MIGRATE_CMA migratetype, which 
> > > > allows
> > > > the page allocator to manage them like regular pages.
> > > > 
> > > > This migratetype lends the pages some very desirable properties:
> > > > 
> > > > * They cannot be longterm pinned, meaning they will always be 
> > > > migratable.
> > > > 
> > > > * The pages can be allocated explicitly by using their PFN (with
> > > > alloc_contig_range()) when they are needed to store tags.
> > > > 
> > > > Signed-off-by: Alexandru Elisei 
> > > > ---
> > > >arch/arm64/Kconfig  |  1 +
> > > >arch/arm64/kernel/mte_tag_storage.c | 68 
> > > > +
> > > >include/linux/mmzone.h  |  5 +++
> > > >mm/internal.h   |  3 --
> > > >4 files changed, 74 insertions(+), 3 deletions(-)
> > > > 
> > > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > > index fe8276fdc7a8..047487046e8f 100644
> > > > --- a/arch/arm64/Kconfig
> > > > +++ b/arch/arm64/Kconfig
> > > > @@ -2065,6 +2065,7 @@ config ARM64_MTE
> > > >if ARM64_MTE
> > > >config ARM64_MTE_TAG_STORAGE
> > > > bool "Dynamic MTE tag storage management"
> > > > +   select CONFIG_CMA
> > > > help
> > > >   Adds support for dynamic management of the memory used by the 
> > > > hardware
> > > >   for storing MTE tags. This memory, unlike normal memory, 
> > > > cannot be
> > > > diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> > > > b/arch/arm64/kernel/mte_tag_storage.c
> > > > index fa6267ef8392..427f4f1909f3 100644
> > > > --- a/arch/arm64/kernel/mte_tag_storage.c
> > > > +++ b/arch/arm64/kernel/mte_tag_storage.c
> > > > @@ -5,10 +5,12 @@
> > > > * Copyright (C) 2023 ARM Ltd.
> > > > */
> > > > +#include 
> > > >#include 
> > > >#include 
> > > >#include 
> > > >#include 
> > > > +#include 
> > > >#include 
> > > >#include 
> > > >#include 
> > > > @@ -189,6 +191,14 @@ static int __init fdt_init_tag_storage(unsigned 
> > > > long node, const char *uname,
> > > > return ret;
> > > > }
> > > > +   /* Pages are managed in pageblock_nr_pages chunks */
> > > > +   if (!IS_ALIGNED(tag_range->start | range_len(tag_range), 
> > > > pageblock_nr_pages)) {
> > > > +   pr_err("Tag storage region 0x%llx-0x%llx not aligned to 
> > > > pageblock size 0x%llx",
> > > > +  PFN_PHYS(tag_range->start), 
> > > > PFN_PHYS(tag_range->end),
> > > > +  PFN_PHYS(pageblock_nr_pages));
> > > > +   return -EINVAL;
> > > > +   }
> > > > +
> > > > ret = tag_storage_get_memory_node(node, _node);
> > > > if (ret)
> > > > return ret;
> > > > @@ -254,3 +264,61 @@ void __init mte_tag_storage_init(void)
> > > > pr_info("MTE tag storage region management disabled");
> > > > }
> > > >}
> > > > +
> > > > +static int __init mte_tag_storage_activate_regions(void)
> > > > +{
> > > > +   phys_addr_t dram_start, dram_end;
> > > > +   struct range *tag_range;
> > > > +   unsigned long pfn;
> > > > +   int i, ret;
> > > > +
> > > > +   if (num_tag_regions == 0)
> > > > +   return 0;
> > > > +
> > > > +   dram_start = memblock_start_of_DRAM();
> > > > +   dram_end = memblock_end_of_DRAM();
> > > > +
> > > > +   for (i = 0; i < num_tag_regions; i++) {
> > > > +   tag_range = &tag_regions[i].tag_range;
> > > > +   /*
> > >

Re: [PATCH RFC v2 04/27] mm: migrate/mempolicy: Add hook to modify migration target gfp

2023-11-28 Thread Alexandru Elisei
Hi,

On Tue, Nov 28, 2023 at 08:49:57AM +0200, Mike Rapoport wrote:
> On Mon, Nov 27, 2023 at 11:52:56AM +0000, Alexandru Elisei wrote:
> > Hi Mike,
> > 
> > I really appreciate you having a look!
> > 
> > On Sat, Nov 25, 2023 at 12:03:22PM +0200, Mike Rapoport wrote:
> > > On Sun, Nov 19, 2023 at 04:56:58PM +0000, Alexandru Elisei wrote:
> > > > It might be desirable for an architecture to modify the gfp flags used 
> > > > to
> > > > allocate the destination page for migration based on the page that it is
> > > > being replaced. For example, if an architecture has metadata associated
> > > > with a page (like arm64, when the memory tagging extension is 
> > > > implemented),
> > > > it can request that the destination page similarly has storage for tags
> > > > already allocated.
> > > > 
> > > > No functional change.
> > > > 
> > > > Signed-off-by: Alexandru Elisei 
> > > > ---
> > > >  include/linux/migrate.h | 4 
> > > >  mm/mempolicy.c  | 2 ++
> > > >  mm/migrate.c| 3 +++
> > > >  3 files changed, 9 insertions(+)
> > > > 
> > > > diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> > > > index 2ce13e8a309b..0acef592043c 100644
> > > > --- a/include/linux/migrate.h
> > > > +++ b/include/linux/migrate.h
> > > > @@ -60,6 +60,10 @@ struct movable_operations {
> > > >  /* Defined in mm/debug.c: */
> > > >  extern const char *migrate_reason_names[MR_TYPES];
> > > >  
> > > > +#ifndef arch_migration_target_gfp
> > > > +#define arch_migration_target_gfp(src, gfp) 0
> > > > +#endif
> > > > +
> > > >  #ifdef CONFIG_MIGRATION
> > > >  
> > > >  void putback_movable_pages(struct list_head *l);
> > > > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > > > index 10a590ee1c89..50bc43ab50d6 100644
> > > > --- a/mm/mempolicy.c
> > > > +++ b/mm/mempolicy.c
> > > > @@ -1182,6 +1182,7 @@ static struct folio 
> > > > *alloc_migration_target_by_mpol(struct folio *src,
> > > >  
> > > > h = folio_hstate(src);
> > > > gfp = htlb_alloc_mask(h);
> > > > +   gfp |= arch_migration_target_gfp(src, gfp);
> > > 
> > > I think it'll be more robust to have arch_migration_target_gfp() to modify
> > > the flags and return the new mask with added (or potentially removed)
> > > flags.
> > 
> > I did it this way so an arch won't be able to remove flags set by the MM 
> > code.
> > There's a similar pattern in do_mmap() -> calc_vm_flag_bits() ->
> > arch_calc_vm_flag_bits().
> 
> Ok, just add a sentence about it to the commit message.

Great, will do that!

Thanks,
Alex

>  
> > Thanks,
> > Alex
> > 
> > > 
> > > > nodemask = policy_nodemask(gfp, pol, ilx, &nid);
> > > > return alloc_hugetlb_folio_nodemask(h, nid, nodemask, 
> > > > gfp);
> > > > }
> > > > @@ -1190,6 +1191,7 @@ static struct folio 
> > > > *alloc_migration_target_by_mpol(struct folio *src,
> > > > gfp = GFP_TRANSHUGE;
> > > > else
> > > > gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | 
> > > > __GFP_COMP;
> > > > +   gfp |= arch_migration_target_gfp(src, gfp);
> > > >  
> > > > page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
> > > > return page_rmappable_folio(page);
> > > 
> > > -- 
> > > Sincerely yours,
> > > Mike.
> > > 
> 
> -- 
> Sincerely yours,
> Mike.
> 



Re: [PATCH RFC v2 05/27] mm: page_alloc: Add an arch hook to allow prep_new_page() to fail

2023-11-28 Thread Alexandru Elisei
Hi,

On Tue, Nov 28, 2023 at 05:57:31PM +0100, David Hildenbrand wrote:
> On 27.11.23 13:09, Alexandru Elisei wrote:
> > Hi,
> > 
> > Thank you so much for your comments, they are genuinely useful.
> > 
> > On Fri, Nov 24, 2023 at 08:35:47PM +0100, David Hildenbrand wrote:
> > > On 19.11.23 17:56, Alexandru Elisei wrote:
> > > > Introduce arch_prep_new_page(), which will be used by arm64 to reserve 
> > > > tag
> > > > storage for an allocated page. Reserving tag storage can fail, for 
> > > > example,
> > > > if the tag storage page has a short pin on it, so allow prep_new_page() 
> > > > ->
> > > > arch_prep_new_page() to similarly fail.
> > > 
> > > But what are the side-effects of this? How does the calling code recover?
> > > 
> > > E.g., what if we need to populate a page into user space, but that
> > > particular page we allocated fails to be prepared? So we inject a signal
> > > into that poor process?
> > 
> > When the page fails to be prepared, it is put back to the tail of the
> > freelist with __free_one_page(.., FPI_TO_TAIL). If all the allocation paths
> > are exhausted and no page has been found for which tag storage has been
> > reserved, then that's treated like an OOM situation.
> > 
> > I have been thinking about this, and I think I can simplify the code by
> > making tag reservation a best effort approach. The page can be allocated
> > even if reserving tag storage fails, but the page is marked as invalid in
> > set_pte_at() (PAGE_NONE + an extra bit to tell arm64 that it needs tag
> > storage) and next time it is accessed, arm64 will reserve tag storage in
> > the fault handling code (the mechanism for that is implemented in patch #19
> > of the series, "mm: mprotect: Introduce PAGE_FAULT_ON_ACCESS for
> > mprotect(PROT_MTE)").
> > 
> > With this new approach, prep_new_page() stays the way it is, and no further
> > changes are required for the page allocator, as there are already arch
> > callbacks that can be used for that, for example tag_clear_highpage() and
> > arch_alloc_page(). The downside is extra page faults, which might impact
> > performance.
> > 
> > What do you think?
> 
> That sounds a lot more robust, compared to intermittent failures to allocate
> pages.

Great, thank you for the feedback, I will use this approach for the next
iteration of the series.

Thanks,
Alex

> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 15/27] arm64: mte: Check that tag storage blocks are in the same zone

2023-11-27 Thread Alexandru Elisei
Hi,

On Fri, Nov 24, 2023 at 08:56:59PM +0100, David Hildenbrand wrote:
> On 19.11.23 17:57, Alexandru Elisei wrote:
> > alloc_contig_range() requires that the requested pages are in the same
> > zone. Check that this is indeed the case before initializing the tag
> > storage blocks.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >   arch/arm64/kernel/mte_tag_storage.c | 33 +
> >   1 file changed, 33 insertions(+)
> > 
> > diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> > b/arch/arm64/kernel/mte_tag_storage.c
> > index 8b9bedf7575d..fd63430d4dc0 100644
> > --- a/arch/arm64/kernel/mte_tag_storage.c
> > +++ b/arch/arm64/kernel/mte_tag_storage.c
> > @@ -265,6 +265,35 @@ void __init mte_tag_storage_init(void)
> > }
> >   }
> > +/* alloc_contig_range() requires all pages to be in the same zone. */
> > +static int __init mte_tag_storage_check_zone(void)
> > +{
> > +   struct range *tag_range;
> > +   struct zone *zone;
> > +   unsigned long pfn;
> > +   u32 block_size;
> > +   int i, j;
> > +
> > +   for (i = 0; i < num_tag_regions; i++) {
> > +   block_size = tag_regions[i].block_size;
> > +   if (block_size == 1)
> > +   continue;
> > +
> > +   tag_range = &tag_regions[i].tag_range;
> > +   for (pfn = tag_range->start; pfn <= tag_range->end; pfn += 
> > block_size) {
> > +   zone = page_zone(pfn_to_page(pfn));
> > +   for (j = 1; j < block_size; j++) {
> > +   if (page_zone(pfn_to_page(pfn + j)) != zone) {
> > +   pr_err("Tag storage block pages in 
> > different zones");
> > +   return -EINVAL;
> > +   }
> > +   }
> > +   }
> > +   }
> > +
> > +return 0;
> > +}
> > +
> 
> Looks like something that ordinary CMA provides. See cma_activate_area().

Indeed.

> 
> Can't we find a way to let CMA do CMA thingies and only be a user of that?
> What would be required to make the performance issue you spelled out in the
> cover letter be gone and not have to open-code that in arch code?

I've replied with a possible solution here [1].

[1] https://lore.kernel.org/all/ZWSvMYMjFLFZ-abv@raptor/

Thanks,
Alex

> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 14/27] arm64: mte: Disable dynamic tag storage management if HW KASAN is enabled

2023-11-27 Thread Alexandru Elisei
Hi,

On Fri, Nov 24, 2023 at 08:54:12PM +0100, David Hildenbrand wrote:
> On 19.11.23 17:57, Alexandru Elisei wrote:
> > To be able to reserve the tag storage associated with a page requires that
> > the tag storage page can be migrated.
> > 
> > When HW KASAN is enabled, the kernel allocates pages, which are now tagged,
> > in non-preemptible contexts, which can make reserving the associated tag
> > storage impossible.
> 
> I assume that it's the only in-kernel user that actually requires tagged
> memory (besides for user space), correct?

Indeed, this is the case. I'll expand the commit message to be more clear about
it.

Thanks,
Alex

> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 13/27] arm64: mte: Make tag storage depend on ARCH_KEEP_MEMBLOCK

2023-11-27 Thread Alexandru Elisei
Hi,

On Fri, Nov 24, 2023 at 08:51:38PM +0100, David Hildenbrand wrote:
> On 19.11.23 17:57, Alexandru Elisei wrote:
> > Tag storage memory requires that the tag storage pages used for data are
> > always migratable when they need to be repurposed to store tags.
> > 
> > If ARCH_KEEP_MEMBLOCK is enabled, kexec will scan all non-reserved
> > memblocks to find a suitable location for copying the kernel image. The
> > kernel image, once loaded, cannot be moved to another location in physical
> > memory. The initialization code for the tag storage reserves the memblocks
> > for the tag storage pages, which means kexec will not use them, and the tag
> > storage pages can be migrated at any time, which is the desired behaviour.
> > 
> > However, if ARCH_KEEP_MEMBLOCK is not selected, kexec will not skip a
> > region unless the memory resource has the IORESOURCE_SYSRAM_DRIVER_MANAGED
> > flag, which isn't currently set by the tag storage initialization code.
> > 
> > Make ARM64_MTE_TAG_STORAGE depend on ARCH_KEEP_MEMBLOCK to make it explicit
> > that the Kconfig option is required for it to work correctly.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >   arch/arm64/Kconfig | 1 +
> >   1 file changed, 1 insertion(+)
> > 
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 047487046e8f..efa5b7958169 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -2065,6 +2065,7 @@ config ARM64_MTE
> >   if ARM64_MTE
> >   config ARM64_MTE_TAG_STORAGE
> > bool "Dynamic MTE tag storage management"
> > +   depends on ARCH_KEEP_MEMBLOCK
> > select CONFIG_CMA
> > help
> >   Adds support for dynamic management of the memory used by the hardware
> 
> Doesn't arm64 select that unconditionally? Why is this required then?

I've added this patch to make the dependency explicit. If, in the future, arm64
stops selecting ARCH_KEEP_MEMBLOCK, I think it would be very easy to miss the
fact that tag storage depends on it. So this patch is not required per se, it's
there to document the dependency.

Thanks,
Alex

> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 12/27] arm64: mte: Add tag storage pages to the MIGRATE_CMA migratetype

2023-11-27 Thread Alexandru Elisei
Hi David,

On Fri, Nov 24, 2023 at 08:40:55PM +0100, David Hildenbrand wrote:
> On 19.11.23 17:57, Alexandru Elisei wrote:
> > Add the MTE tag storage pages to the MIGRATE_CMA migratetype, which allows
> > the page allocator to manage them like regular pages.
> > 
> > This migratetype lends the pages some very desirable properties:
> > 
> > * They cannot be longterm pinned, meaning they will always be migratable.
> > 
> > * The pages can be allocated explicitly by using their PFN (with
> >alloc_contig_range()) when they are needed to store tags.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >   arch/arm64/Kconfig  |  1 +
> >   arch/arm64/kernel/mte_tag_storage.c | 68 +
> >   include/linux/mmzone.h  |  5 +++
> >   mm/internal.h   |  3 --
> >   4 files changed, 74 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index fe8276fdc7a8..047487046e8f 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -2065,6 +2065,7 @@ config ARM64_MTE
> >   if ARM64_MTE
> >   config ARM64_MTE_TAG_STORAGE
> > bool "Dynamic MTE tag storage management"
> > +   select CONFIG_CMA
> > help
> >   Adds support for dynamic management of the memory used by the hardware
> >   for storing MTE tags. This memory, unlike normal memory, cannot be
> > diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> > b/arch/arm64/kernel/mte_tag_storage.c
> > index fa6267ef8392..427f4f1909f3 100644
> > --- a/arch/arm64/kernel/mte_tag_storage.c
> > +++ b/arch/arm64/kernel/mte_tag_storage.c
> > @@ -5,10 +5,12 @@
> >* Copyright (C) 2023 ARM Ltd.
> >*/
> > +#include 
> >   #include 
> >   #include 
> >   #include 
> >   #include 
> > +#include 
> >   #include 
> >   #include 
> >   #include 
> > @@ -189,6 +191,14 @@ static int __init fdt_init_tag_storage(unsigned long 
> > node, const char *uname,
> > return ret;
> > }
> > +   /* Pages are managed in pageblock_nr_pages chunks */
> > +   if (!IS_ALIGNED(tag_range->start | range_len(tag_range), 
> > pageblock_nr_pages)) {
> > +   pr_err("Tag storage region 0x%llx-0x%llx not aligned to 
> > pageblock size 0x%llx",
> > +  PFN_PHYS(tag_range->start), PFN_PHYS(tag_range->end),
> > +  PFN_PHYS(pageblock_nr_pages));
> > +   return -EINVAL;
> > +   }
> > +
> > ret = tag_storage_get_memory_node(node, _node);
> > if (ret)
> > return ret;
> > @@ -254,3 +264,61 @@ void __init mte_tag_storage_init(void)
> > pr_info("MTE tag storage region management disabled");
> > }
> >   }
> > +
> > +static int __init mte_tag_storage_activate_regions(void)
> > +{
> > +   phys_addr_t dram_start, dram_end;
> > +   struct range *tag_range;
> > +   unsigned long pfn;
> > +   int i, ret;
> > +
> > +   if (num_tag_regions == 0)
> > +   return 0;
> > +
> > +   dram_start = memblock_start_of_DRAM();
> > +   dram_end = memblock_end_of_DRAM();
> > +
> > +   for (i = 0; i < num_tag_regions; i++) {
> > +   tag_range = &tag_regions[i].tag_range;
> > +   /*
> > +* Tag storage region was clipped by arm64_bootmem_init()
> > +* enforcing addressing limits.
> > +*/
> > +   if (PFN_PHYS(tag_range->start) < dram_start ||
> > +   PFN_PHYS(tag_range->end) >= dram_end) {
> > +   pr_err("Tag storage region 0x%llx-0x%llx outside 
> > addressable memory",
> > +  PFN_PHYS(tag_range->start), 
> > PFN_PHYS(tag_range->end));
> > +   ret = -EINVAL;
> > +   goto out_disabled;
> > +   }
> > +   }
> > +
> > +   /*
> > +* MTE disabled, tag storage pages can be used like any other pages. The
> > +* only restriction is that the pages cannot be used by kexec because
> > +* the memory remains marked as reserved in the memblock allocator.
> > +*/
> > +   if (!system_supports_mte()) {
> > +   for (i = 0; i< num_tag_regions; i++) {
> > +   tag_range = &tag_regions[i].tag_range;
> > +   for (pfn = tag_range->start; pfn <= tag_range->end; 
> > pfn++)
> &g

Re: [PATCH RFC v2 06/27] mm: page_alloc: Allow an arch to hook early into free_pages_prepare()

2023-11-27 Thread Alexandru Elisei
Hi,

On Fri, Nov 24, 2023 at 08:36:52PM +0100, David Hildenbrand wrote:
> On 19.11.23 17:57, Alexandru Elisei wrote:
> > Add arch_free_pages_prepare() hook that is called before the page flags
> > are cleared. This will be used by arm64 when explicit management of tag
> > storage pages is enabled.
> 
> Can you elaborate a bit what exactly will be done by that code with that
> information?

Of course.

The MTE code that is in the kernel today uses the PG_arch_2 page flag, which it
renames to PG_mte_tagged, to track if a page has been mapped with tagging
enabled. That flag is cleared by free_pages_prepare() when it does:

page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;

When tag storage management is enabled, tag storage is reserved for a page if
and only if the page is mapped as tagged. When a page is freed, the code looks
at the PG_mte_tagged flag to determine if the page was mapped as tagged, and
therefore has tag storage reserved, to determine if the corresponding tag
storage should also be freed.

I have considered using arch_free_page(), but free_pages_prepare() calls the
function after the flags are cleared.

Does that answer your question?

Alex

> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 05/27] mm: page_alloc: Add an arch hook to allow prep_new_page() to fail

2023-11-27 Thread Alexandru Elisei
Hi,

Thank you so much for your comments, they are genuinely useful.

On Fri, Nov 24, 2023 at 08:35:47PM +0100, David Hildenbrand wrote:
> On 19.11.23 17:56, Alexandru Elisei wrote:
> > Introduce arch_prep_new_page(), which will be used by arm64 to reserve tag
> > storage for an allocated page. Reserving tag storage can fail, for example,
> > if the tag storage page has a short pin on it, so allow prep_new_page() ->
> > arch_prep_new_page() to similarly fail.
> 
> But what are the side-effects of this? How does the calling code recover?
> 
> E.g., what if we need to populate a page into user space, but that
> particular page we allocated fails to be prepared? So we inject a signal
> into that poor process?

When the page fails to be prepared, it is put back to the tail of the
freelist with __free_one_page(.., FPI_TO_TAIL). If all the allocation paths
are exhausted and no page has been found for which tag storage has been
reserved, then that's treated like an OOM situation.

I have been thinking about this, and I think I can simplify the code by
making tag reservation a best effort approach. The page can be allocated
even if reserving tag storage fails, but the page is marked as invalid in
set_pte_at() (PAGE_NONE + an extra bit to tell arm64 that it needs tag
storage) and next time it is accessed, arm64 will reserve tag storage in
the fault handling code (the mechanism for that is implemented in patch #19
of the series, "mm: mprotect: Introduce PAGE_FAULT_ON_ACCESS for
mprotect(PROT_MTE)").

With this new approach, prep_new_page() stays the way it is, and no further
changes are required for the page allocator, as there are already arch
callbacks that can be used for that, for example tag_clear_highpage() and
arch_alloc_page(). The downside is extra page faults, which might impact
performance.

What do you think?

Thanks,
Alex

> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



Re: [PATCH RFC v2 04/27] mm: migrate/mempolicy: Add hook to modify migration target gfp

2023-11-27 Thread Alexandru Elisei
Hi Mike,

I really appreciate you having a look!

On Sat, Nov 25, 2023 at 12:03:22PM +0200, Mike Rapoport wrote:
> On Sun, Nov 19, 2023 at 04:56:58PM +0000, Alexandru Elisei wrote:
> > It might be desirable for an architecture to modify the gfp flags used to
> > allocate the destination page for migration based on the page that is
> > being replaced. For example, if an architecture has metadata associated
> > with a page (like arm64, when the memory tagging extension is implemented),
> > it can request that the destination page similarly has storage for tags
> > already allocated.
> > 
> > No functional change.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  include/linux/migrate.h | 4 
> >  mm/mempolicy.c  | 2 ++
> >  mm/migrate.c| 3 +++
> >  3 files changed, 9 insertions(+)
> > 
> > diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> > index 2ce13e8a309b..0acef592043c 100644
> > --- a/include/linux/migrate.h
> > +++ b/include/linux/migrate.h
> > @@ -60,6 +60,10 @@ struct movable_operations {
> >  /* Defined in mm/debug.c: */
> >  extern const char *migrate_reason_names[MR_TYPES];
> >  
> > +#ifndef arch_migration_target_gfp
> > +#define arch_migration_target_gfp(src, gfp) 0
> > +#endif
> > +
> >  #ifdef CONFIG_MIGRATION
> >  
> >  void putback_movable_pages(struct list_head *l);
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > index 10a590ee1c89..50bc43ab50d6 100644
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -1182,6 +1182,7 @@ static struct folio 
> > *alloc_migration_target_by_mpol(struct folio *src,
> >  
> > h = folio_hstate(src);
> > gfp = htlb_alloc_mask(h);
> > +   gfp |= arch_migration_target_gfp(src, gfp);
> 
> I think it'll be more robust to have arch_migration_target_gfp() to modify
> the flags and return the new mask with added (or potentially removed)
> flags.

I did it this way so an arch won't be able to remove flags set by the MM code.
There's a similar pattern in do_mmap() -> calc_vm_flag_bits() ->
arch_calc_vm_flag_bits().

I'll change it to return the new mask if you think that's better.

Thanks,
Alex

> 
> > > nodemask = policy_nodemask(gfp, pol, ilx, &nid);
> > return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
> > }
> > @@ -1190,6 +1191,7 @@ static struct folio 
> > *alloc_migration_target_by_mpol(struct folio *src,
> > gfp = GFP_TRANSHUGE;
> > else
> > gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
> > +   gfp |= arch_migration_target_gfp(src, gfp);
> >  
> > page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
> > return page_rmappable_folio(page);
> 
> -- 
> Sincerely yours,
> Mike.
> 



Re: [PATCH RFC v2 20/27] mm: hugepage: Handle huge page fault on access

2023-11-22 Thread Alexandru Elisei
Hi Peter,

On Tue, Nov 21, 2023 at 05:28:49PM -0800, Peter Collingbourne wrote:
> On Sun, Nov 19, 2023 at 8:59 AM Alexandru Elisei
>  wrote:
> >
> > Handle PAGE_FAULT_ON_ACCESS faults for huge pages in a similar way to
> > regular pages.
> >
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  arch/arm64/include/asm/mte_tag_storage.h |  1 +
> >  arch/arm64/include/asm/pgtable.h |  7 ++
> >  arch/arm64/mm/fault.c| 81 
> >  include/linux/huge_mm.h  |  2 +
> >  include/linux/pgtable.h  |  5 ++
> >  mm/huge_memory.c |  4 +-
> >  mm/memory.c  |  3 +
> >  7 files changed, 101 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
> > b/arch/arm64/include/asm/mte_tag_storage.h
> > index c70ced60a0cd..b97406d369ce 100644
> > --- a/arch/arm64/include/asm/mte_tag_storage.h
> > +++ b/arch/arm64/include/asm/mte_tag_storage.h
> > @@ -35,6 +35,7 @@ void free_tag_storage(struct page *page, int order);
> >  bool page_tag_storage_reserved(struct page *page);
> >
> >  vm_fault_t handle_page_missing_tag_storage(struct vm_fault *vmf);
> > +vm_fault_t handle_huge_page_missing_tag_storage(struct vm_fault *vmf);
> >  #else
> >  static inline bool tag_storage_enabled(void)
> >  {
> > diff --git a/arch/arm64/include/asm/pgtable.h 
> > b/arch/arm64/include/asm/pgtable.h
> > index 8cc135f1c112..1704411c096d 100644
> > --- a/arch/arm64/include/asm/pgtable.h
> > +++ b/arch/arm64/include/asm/pgtable.h
> > @@ -477,6 +477,13 @@ static inline vm_fault_t 
> > arch_do_page_fault_on_access(struct vm_fault *vmf)
> > return handle_page_missing_tag_storage(vmf);
> > return VM_FAULT_SIGBUS;
> >  }
> > +
> > +static inline vm_fault_t arch_do_huge_page_fault_on_access(struct vm_fault 
> > *vmf)
> > +{
> > +   if (tag_storage_enabled())
> > +   return handle_huge_page_missing_tag_storage(vmf);
> > +   return VM_FAULT_SIGBUS;
> > +}
> >  #endif /* CONFIG_ARCH_HAS_FAULT_ON_ACCESS */
> >
> >  #define pmd_present_invalid(pmd) (!!(pmd_val(pmd) & 
> > PMD_PRESENT_INVALID))
> > diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> > index f5fa583acf18..6730a0812a24 100644
> > --- a/arch/arm64/mm/fault.c
> > +++ b/arch/arm64/mm/fault.c
> > @@ -1041,6 +1041,87 @@ vm_fault_t handle_page_missing_tag_storage(struct 
> > vm_fault *vmf)
> >
> > return 0;
> >
> > +out_retry:
> > +   put_page(page);
> > +   if (vmf->flags & FAULT_FLAG_VMA_LOCK)
> > +   vma_end_read(vma);
> > +   if (fault_flag_allow_retry_first(vmf->flags)) {
> > +   err = VM_FAULT_RETRY;
> > +   } else {
> > +   /* Replay the fault. */
> > +   err = 0;
> > +   }
> > +   return err;
> > +}
> > +
> > +vm_fault_t handle_huge_page_missing_tag_storage(struct vm_fault *vmf)
> > +{
> > +   unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> > +   struct vm_area_struct *vma = vmf->vma;
> > +   pmd_t old_pmd, new_pmd;
> > +   bool writable = false;
> > +   struct page *page;
> > +   vm_fault_t err;
> > +   int ret;
> > +
> > +   vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> > +   if (unlikely(!pmd_same(vmf->orig_pmd, *vmf->pmd))) {
> > +   spin_unlock(vmf->ptl);
> > +   return 0;
> > +   }
> > +
> > +   old_pmd = vmf->orig_pmd;
> > +   new_pmd = pmd_modify(old_pmd, vma->vm_page_prot);
> > +
> > +   /*
> > +* Detect now whether the PMD could be writable; this information
> > +* is only valid while holding the PT lock.
> > +*/
> > +   writable = pmd_write(new_pmd);
> > +   if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
> > +   can_change_pmd_writable(vma, vmf->address, new_pmd))
> > +   writable = true;
> > +
> > +   page = vm_normal_page_pmd(vma, haddr, new_pmd);
> > +   if (!page)
> > +   goto out_map;
> > +
> > +   if (!(vma->vm_flags & VM_MTE))
> > +   goto out_map;
> > +
> > +   get_page(page);
> > +   vma_set_access_pid_bit(vma);
> > +
> > +   spin_unlock(vmf->ptl);
> >

Re: [PATCH RFC 20/37] mm: compaction: Reserve metadata storage in compaction_alloc()

2023-11-21 Thread Alexandru Elisei
Hi Peter,

On Mon, Nov 20, 2023 at 08:49:32PM -0800, Peter Collingbourne wrote:
> Hi Alexandru,
> 
> On Wed, Aug 23, 2023 at 6:16 AM Alexandru Elisei
>  wrote:
> >
> > If the source page being migrated has metadata associated with it, make
> > sure to reserve the metadata storage when choosing a suitable destination
> > page from the free list.
> >
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  mm/compaction.c | 9 +
> >  mm/internal.h   | 1 +
> >  2 files changed, 10 insertions(+)
> >
> > diff --git a/mm/compaction.c b/mm/compaction.c
> > index cc0139fa0cb0..af2ee3085623 100644
> > --- a/mm/compaction.c
> > +++ b/mm/compaction.c
> > @@ -570,6 +570,7 @@ static unsigned long isolate_freepages_block(struct 
> > compact_control *cc,
> > bool locked = false;
> > unsigned long blockpfn = *start_pfn;
> > unsigned int order;
> > +   int ret;
> >
> > /* Strict mode is for isolation, speed is secondary */
> > if (strict)
> > @@ -626,6 +627,11 @@ static unsigned long isolate_freepages_block(struct 
> > compact_control *cc,
> >
> > /* Found a free page, will break it into order-0 pages */
> > order = buddy_order(page);
> > +   if (metadata_storage_enabled() && cc->reserve_metadata) {
> > +   ret = reserve_metadata_storage(page, order, 
> > cc->gfp_mask);
> 
> At this point the zone lock is held and preemption is disabled, which
> makes it invalid to call reserve_metadata_storage.

You are correct, I missed that. I dropped reserving tag storage during
compaction in the next iteration, so fortunately I unintentionally fixed
it.

Thanks,
Alex

> 
> Peter
> 
> > +   if (ret)
> > +   goto isolate_fail;
> > +   }
> > isolated = __isolate_free_page(page, order);
> > if (!isolated)
> > break;
> > @@ -1757,6 +1763,9 @@ static struct folio *compaction_alloc(struct folio 
> > *src, unsigned long data)
> > struct compact_control *cc = (struct compact_control *)data;
> > struct folio *dst;
> >
> > +   if (metadata_storage_enabled())
> > +   cc->reserve_metadata = folio_has_metadata(src);
> > +
> > > if (list_empty(&cc->freepages)) {
> > isolate_freepages(cc);
> >
> > diff --git a/mm/internal.h b/mm/internal.h
> > index d28ac0085f61..046cc264bfbe 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -492,6 +492,7 @@ struct compact_control {
> >  */
> > bool alloc_contig;  /* alloc_contig_range allocation */
> > bool source_has_metadata;   /* source pages have associated 
> > metadata */
> > +   bool reserve_metadata;
> >  };
> >
> >  /*
> > --
> > 2.41.0
> >



[PATCH RFC v2 27/27] arm64: mte: Enable dynamic tag storage reuse

2023-11-19 Thread Alexandru Elisei
Everything is in place, enable tag storage management.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/kernel/mte_tag_storage.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index 11961587382d..9f60e952a814 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -395,6 +395,9 @@ static int __init mte_tag_storage_activate_regions(void)
 
reserve_tag_storage(ZERO_PAGE(0), 0, GFP_HIGHUSER_MOVABLE);
 
+   static_branch_enable(&tag_storage_enabled_key);
+   pr_info("MTE tag storage region management enabled");
+
return 0;
 
 out_disabled:
-- 
2.42.1




[PATCH RFC v2 26/27] arm64: mte: Fast track reserving tag storage when the block is free

2023-11-19 Thread Alexandru Elisei
A double digit performance decrease for Chrome startup time has been
reported with the dynamic tag storage management enabled. A large part of
the regression is due to lru_cache_disable(), called from
__alloc_contig_migrate_range(), which IPIs all CPUs in the system.

Improve the performance by taking the storage block directly from the
freelist if it's free, thus sidestepping the costly function call.

Note that at the moment this is implemented only when the block size is
1 (the block is one page); larger block sizes could be added later if
necessary.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/Kconfig  |  1 +
 arch/arm64/kernel/mte_tag_storage.c | 15 +++
 include/linux/page-flags.h  | 15 +--
 mm/Kconfig  |  4 
 mm/memory-failure.c |  8 
 mm/page_alloc.c | 21 -
 6 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3b9c435eaafb..93a4bbca3800 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2067,6 +2067,7 @@ config ARM64_MTE_TAG_STORAGE
bool "Dynamic MTE tag storage management"
depends on ARCH_KEEP_MEMBLOCK
select ARCH_HAS_FAULT_ON_ACCESS
+   select WANTS_TAKE_PAGE_OFF_BUDDY
select CONFIG_CMA
help
  Adds support for dynamic management of the memory used by the hardware
diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index 602fdc70db1c..11961587382d 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -522,6 +522,7 @@ int reserve_tag_storage(struct page *page, int order, gfp_t 
gfp)
unsigned long block;
unsigned long flags;
unsigned int tries;
+   bool success;
int ret = 0;
 
VM_WARN_ON_ONCE(!preemptible());
@@ -565,6 +566,19 @@ int reserve_tag_storage(struct page *page, int order, 
gfp_t gfp)
if (tag_storage_block_is_reserved(block))
continue;
 
+   if (region->block_size == 1 && 
is_free_buddy_page(pfn_to_page(block))) {
+   success = take_page_off_buddy(pfn_to_page(block), 
false);
+   if (success) {
+   ret = tag_storage_reserve_block(block, region, 
order);
+   if (ret) {
+   put_page_back_buddy(pfn_to_page(block), 
false);
+   goto out_error;
+   }
+   page_ref_inc(pfn_to_page(block));
+   goto success_next;
+   }
+   }
+
tries = 3;
while (tries--) {
ret = alloc_contig_range(block, block + 
region->block_size, MIGRATE_CMA, gfp);
@@ -598,6 +612,7 @@ int reserve_tag_storage(struct page *page, int order, gfp_t 
gfp)
goto out_error;
}
 
+success_next:
count_vm_events(CMA_ALLOC_SUCCESS, region->block_size);
}
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 7915165a51bd..0d0380141f5d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -576,11 +576,22 @@ TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
 #define MAGIC_HWPOISON 0x48575053U /* HWPS */
 extern void SetPageHWPoisonTakenOff(struct page *page);
 extern void ClearPageHWPoisonTakenOff(struct page *page);
-extern bool take_page_off_buddy(struct page *page);
-extern bool put_page_back_buddy(struct page *page);
+extern bool PageHWPoisonTakenOff(struct page *page);
 #else
 PAGEFLAG_FALSE(HWPoison, hwpoison)
+TESTSCFLAG_FALSE(HWPoison, hwpoison)
 #define __PG_HWPOISON 0
+static inline void SetPageHWPoisonTakenOff(struct page *page) { }
+static inline void ClearPageHWPoisonTakenOff(struct page *page) { }
+static inline bool PageHWPoisonTakenOff(struct page *page)
+{
+   return false;
+}
+#endif
+
+#ifdef CONFIG_WANTS_TAKE_PAGE_OFF_BUDDY
+extern bool take_page_off_buddy(struct page *page, bool poison);
+extern bool put_page_back_buddy(struct page *page, bool unpoison);
 #endif
 
 #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
diff --git a/mm/Kconfig b/mm/Kconfig
index a90eefc3ee80..0766cdc3de4d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -773,6 +773,7 @@ config MEMORY_FAILURE
depends on MMU
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
+   select WANTS_TAKE_PAGE_OFF_BUDDY
select MEMORY_ISOLATION
select RAS
help
@@ -1022,6 +1023,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config ARCH_HAS_FAULT_ON_ACCESS
bool
 
+config WANTS_TAKE_PAGE_OFF_BUDDY
+   bool
+
 config ARCH_HAS_CURRENT_STACK_POINTER
bool
 

[PATCH RFC v2 24/27] arm64: mte: Handle fatal signal in reserve_tag_storage()

2023-11-19 Thread Alexandru Elisei
As long as a fatal signal is pending, alloc_contig_range() will fail with
-EINTR. This makes it impossible for tag storage allocation to succeed, and
the page allocator will print an OOM splat.

The process is going to be killed, so return 0 (success) from
reserve_tag_storage() to allow the page allocator to make progress.
set_pte_at() will map it with PAGE_FAULT_ON_ACCESS and subsequent accesses
from different threads will cause a fault until the signal is delivered.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/kernel/mte_tag_storage.c | 17 +
 arch/arm64/mm/fault.c   |  5 +
 2 files changed, 22 insertions(+)

diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index 6b11bb408b51..602fdc70db1c 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -572,6 +572,23 @@ int reserve_tag_storage(struct page *page, int order, 
gfp_t gfp)
break;
}
 
+   /*
+* alloc_contig_range() returns -EINTR from
+* __alloc_contig_migrate_range() if a fatal signal is pending.
+* As long as the signal hasn't been handled, it is impossible
+* to reserve tag storage for any page. Stop trying to reserve
+* tag storage, but return 0 so the page allocator can make
+* forward progress, instead of printing an OOM splat.
+*
+* The tagged page with missing tag storage will be mapped with
+* PAGE_FAULT_ON_ACCESS in set_pte_at(), which means accesses
+* until the signal is delivered will cause a fault.
+*/
+   if (ret == -EINTR) {
+   ret = 0;
+   goto out_error;
+   }
+
if (ret)
goto out_error;
 
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 964c5ae161a3..fdc98c5828bf 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -950,6 +950,11 @@ gfp_t arch_calc_vma_gfp(struct vm_area_struct *vma, gfp_t 
gfp)
 
 void tag_clear_highpage(struct page *page)
 {
+   if (tag_storage_enabled() && 
unlikely(!page_tag_storage_reserved(page))) {
+   clear_page(page_address(page));
+   return;
+   }
+
/* Newly allocated page, shouldn't have been tagged yet */
WARN_ON_ONCE(!try_page_mte_tagging(page));
mte_zero_clear_page_tags(page_address(page));
-- 
2.42.1




[PATCH RFC v2 25/27] KVM: arm64: Disable MTE if tag storage is enabled

2023-11-19 Thread Alexandru Elisei
KVM allows MTE enabled VMs to be created when the backing VMA does not have
MTE enabled.  Without changes to how KVM allocates memory for a VM, it is
impossible at the moment to discern when the corresponding tag storage
needs to be reserved.

For now, disable MTE in KVM if tag storage is enabled.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/kvm/arm.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e5f75f1f1085..5b33c532c62a 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -86,7 +87,8 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
break;
case KVM_CAP_ARM_MTE:
mutex_lock(&kvm->lock);
-   if (!system_supports_mte() || kvm->created_vcpus) {
+   if (!system_supports_mte() || tag_storage_enabled() ||
+   kvm->created_vcpus) {
r = -EINVAL;
} else {
r = 0;
@@ -279,7 +281,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = 1;
break;
case KVM_CAP_ARM_MTE:
-   r = system_supports_mte();
+   r = system_supports_mte() && !tag_storage_enabled();
break;
case KVM_CAP_STEAL_TIME:
r = kvm_arm_pvtime_supported();
-- 
2.42.1




[PATCH RFC v2 23/27] arm64: mte: copypage: Handle tag restoring when missing tag storage

2023-11-19 Thread Alexandru Elisei
There are several situations where copy_highpage() can end up copying
tags to a page which doesn't have its tag storage reserved.

One situation involves migration racing with mprotect(PROT_MTE): VMA is
initially untagged, migration starts and destination page is allocated
as untagged, mprotect(PROT_MTE) changes the VMA to tagged and userspace
accesses the source page, thus making it tagged.  The migration code
then calls copy_highpage(), which will copy the tags from the source
page (now tagged) to the destination page (allocated as untagged).

Yet another situation can happen during THP collapse. The huge page that
will replace the HPAGE_PMD_NR contiguous mapped pages is allocated with
__GFP_TAGGED not set. copy_highpage() will copy the tags from the pages
being replaced to the huge page which doesn't have tag storage reserved.

The situation gets even more complicated when the replacement huge page
is a tag storage page. The tag storage huge page will be migrated after
a fault on access, but the tags from the original pages must be copied
over to the huge page that will be replacing the tag storage huge page.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/mm/copypage.c | 59 
 1 file changed, 59 insertions(+)

diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index a7bb20055ce0..7899f38773b9 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -13,6 +13,62 @@
 #include 
 #include 
 #include 
+#include 
+
+#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
+static inline bool try_transfer_saved_tags(struct page *from, struct page *to)
+{
+   void *tags;
+   bool saved;
+
+   VM_WARN_ON_ONCE(!preemptible());
+
+   if (page_mte_tagged(from)) {
+   if (likely(page_tag_storage_reserved(to)))
+   return false;
+
+   tags = mte_allocate_tag_buf();
+   if (WARN_ON(!tags))
+   return true;
+
+   mte_copy_page_tags_to_buf(page_address(from), tags);
+   saved = mte_save_tags_for_pfn(tags, page_to_pfn(to));
+   if (!saved)
+   mte_free_tag_buf(tags);
+
+   return saved;
+   }
+
+   if (likely(!page_is_tag_storage(from)))
+   return false;
+
+   tags_by_pfn_lock();
+   tags = mte_erase_tags_for_pfn(page_to_pfn(from));
+   tags_by_pfn_unlock();
+
+   if (likely(!tags))
+   return false;
+
+   if (page_tag_storage_reserved(to)) {
+   WARN_ON_ONCE(!try_page_mte_tagging(to));
+   mte_copy_page_tags_from_buf(page_address(to), tags);
+   set_page_mte_tagged(to);
+   mte_free_tag_buf(tags);
+   return true;
+   }
+
+   saved = mte_save_tags_for_pfn(tags, page_to_pfn(to));
+   if (!saved)
+   mte_free_tag_buf(tags);
+
+   return saved;
+}
+#else
+static inline bool try_transfer_saved_tags(struct page *from, struct page *to)
+{
+   return false;
+}
+#endif
 
 void copy_highpage(struct page *to, struct page *from)
 {
@@ -24,6 +80,9 @@ void copy_highpage(struct page *to, struct page *from)
if (kasan_hw_tags_enabled())
page_kasan_tag_reset(to);
 
+   if (tag_storage_enabled() && try_transfer_saved_tags(from, to))
+   return;
+
if (system_supports_mte() && page_mte_tagged(from)) {
/* It's a new page, shouldn't have been tagged yet */
WARN_ON_ONCE(!try_page_mte_tagging(to));
-- 
2.42.1




[PATCH RFC v2 22/27] arm64: mte: swap: Handle tag restoring when missing tag storage

2023-11-19 Thread Alexandru Elisei
Linux restores tags when a page is swapped in and there are tags associated
with the swap entry which the new page will replace. The saved tags are
restored even if the page will not be mapped as tagged, to protect against
cases where the page is shared between different VMAs, and is tagged in
some, but untagged in others. By using this approach, the process can still
access the correct tags following an mprotect(PROT_MTE) on the non-MTE
enabled VMA.

But this poses a challenge for managing tag storage: in the scenario above,
when a new page is allocated to be swapped in for the process where it will
be mapped as untagged, the corresponding tag storage block is not reserved.
mte_restore_page_tags_by_swp_entry(), when it restores the saved tags, will
overwrite data in the tag storage block associated with the new page,
leading to data corruption if the block is in use by a process.

Get around this issue by saving the tags in a new xarray, this time indexed
by the page pfn, and then restoring them when tag storage is reserved for
the page.

Signed-off-by: Alexandru Elisei 
---
 arch/arm64/include/asm/mte_tag_storage.h |   9 ++
 arch/arm64/include/asm/pgtable.h |  11 +++
 arch/arm64/kernel/mte_tag_storage.c  |  20 +++-
 arch/arm64/mm/mteswap.c  | 112 +++
 4 files changed, 148 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
b/arch/arm64/include/asm/mte_tag_storage.h
index 6a8b19a6a758..a3c38099fe1a 100644
--- a/arch/arm64/include/asm/mte_tag_storage.h
+++ b/arch/arm64/include/asm/mte_tag_storage.h
@@ -37,6 +37,15 @@ bool page_is_tag_storage(struct page *page);
 
 vm_fault_t handle_page_missing_tag_storage(struct vm_fault *vmf);
 vm_fault_t handle_huge_page_missing_tag_storage(struct vm_fault *vmf);
+
+void tags_by_pfn_lock(void);
+void tags_by_pfn_unlock(void);
+
+void *mte_erase_tags_for_pfn(unsigned long pfn);
+bool mte_save_tags_for_pfn(void *tags, unsigned long pfn);
+void mte_restore_tags_for_pfn(unsigned long start_pfn, int order);
+
+vm_fault_t mte_try_transfer_swap_tags(swp_entry_t entry, struct page *page);
 #else
 static inline bool tag_storage_enabled(void)
 {
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 1704411c096d..1a25b7d601c2 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1084,6 +1084,17 @@ static inline void arch_swap_invalidate_area(int type)
mte_invalidate_tags_area_by_swp_entry(type);
 }
 
+#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
+#define __HAVE_ARCH_SWAP_PREPARE_TO_RESTORE
+static inline vm_fault_t arch_swap_prepare_to_restore(swp_entry_t entry,
+ struct folio *folio)
+{
+   if (tag_storage_enabled())
+   return mte_try_transfer_swap_tags(entry, &folio->page);
+   return 0;
+}
+#endif
+
 #define __HAVE_ARCH_SWAP_RESTORE
 static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
 {
diff --git a/arch/arm64/kernel/mte_tag_storage.c 
b/arch/arm64/kernel/mte_tag_storage.c
index 5096ce859136..6b11bb408b51 100644
--- a/arch/arm64/kernel/mte_tag_storage.c
+++ b/arch/arm64/kernel/mte_tag_storage.c
@@ -547,8 +547,10 @@ int reserve_tag_storage(struct page *page, int order, 
gfp_t gfp)
mutex_lock(&tag_blocks_lock);
 
/* Check again, this time with the lock held. */
-   if (page_tag_storage_reserved(page))
-   goto out_unlock;
+   if (page_tag_storage_reserved(page)) {
+   mutex_unlock(&tag_blocks_lock);
+   return 0;
+   }
 
/* Make sure existing entries are not freed from out under out feet. */
xa_lock_irqsave(&tag_blocks_reserved, flags);
@@ -583,9 +585,10 @@ int reserve_tag_storage(struct page *page, int order, 
gfp_t gfp)
}
 
page_set_tag_storage_reserved(page, order);
-out_unlock:
mutex_unlock(&tag_blocks_lock);
 
+   mte_restore_tags_for_pfn(page_to_pfn(page), order);
+
return 0;
 
 out_error:
@@ -612,7 +615,8 @@ void free_tag_storage(struct page *page, int order)
struct tag_region *region;
unsigned long page_va;
unsigned long flags;
-   int ret;
+   void *tags;
+   int i, ret;
 
ret = tag_storage_find_block(page, &start_block, &region);
if (WARN_ONCE(ret, "Missing tag storage block for pfn 0x%lx", 
page_to_pfn(page)))
@@ -622,6 +626,14 @@ void free_tag_storage(struct page *page, int order)
/* Avoid writeback of dirty tag cache lines corrupting data. */
dcache_inval_tags_poc(page_va, page_va + (PAGE_SIZE << order));
 
+   tags_by_pfn_lock();
+   for (i = 0; i < (1 << order); i++) {
+   tags = mte_erase_tags_for_pfn(page_to_pfn(page + i));
+   if (unlikely(tags))
+   mte_free_tag_buf(tags);
+   }
+   tags_by_pfn_unlock();
+
end_block = start_block + order_to_num_blocks(

  1   2   3   >