From: "Mike Rapoport (IBM)" <r...@kernel.org>

Using large pages to map text areas reduces iTLB pressure and improves
performance.

Extend execmem_alloc() with the ability to use PMD_SIZE'd pages with ROX
permissions as a cache for smaller allocations.

To populate the cache, a writable large page is allocated from vmalloc with
VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
ROX.

Portions of that large page are handed out to execmem_alloc() callers
without any changes to the permissions.

When the memory is freed with execmem_free() it is invalidated again so
that it won't contain stale instructions.

The cache is enabled when an architecture sets the EXECMEM_ROX_CACHE flag in
the definition of an execmem_range.

Signed-off-by: Mike Rapoport (IBM) <r...@kernel.org>
---
 include/linux/execmem.h |   2 +
 mm/execmem.c            | 267 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 262 insertions(+), 7 deletions(-)

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 9d22999dbd7d..06f678e6fe55 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -77,12 +77,14 @@ struct execmem_range {
 
 /**
  * struct execmem_info - architecture parameters for code allocations
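+ * @invalidate: fill @size bytes at @ptr with invalid instructions; @writable
+ * tells whether the mapping at @ptr can be written to directly. When the
+ * callback is not set, the memory is filled with zeroes using memset().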
  * @ranges: array of parameter sets defining architecture specific
  * parameters for executable memory allocations. The ranges that are not
  * explicitly initialized by an architecture use parameters defined for
  * @EXECMEM_DEFAULT.
  */
 struct execmem_info {
+       void (*invalidate)(void *ptr, size_t size, bool writable);
        struct execmem_range    ranges[EXECMEM_TYPE_MAX];
 };
 
diff --git a/mm/execmem.c b/mm/execmem.c
index c920d2b5a721..716fba68ab0e 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -1,30 +1,88 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/mm.h>
+#include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/execmem.h>
+#include <linux/maple_tree.h>
 #include <linux/moduleloader.h>
 #include <linux/text-patching.h>
 
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
 static struct execmem_info *execmem_info __ro_after_init;
 static struct execmem_info default_execmem_info __ro_after_init;
 
-static void *__execmem_alloc(struct execmem_range *range, size_t size)
+/*
+ * Bookkeeping for the cache of large ROX pages.
+ *
+ * @mutex: serializes updates of both trees; it is also registered as their
+ *         external lock (MT_FLAGS_LOCK_EXTERN)
+ * @busy_areas: address ranges currently handed out by the cache, each entry
+ *              stores the size of its range as an xarray value
+ * @free_areas: address ranges available for allocation, each entry stores
+ *              the size of its range as an xarray value
+ */
+struct execmem_cache {
+       struct mutex mutex;
+       struct maple_tree busy_areas;
+       struct maple_tree free_areas;
+};
+
+/* The single, statically initialized instance of the cache. */
+static struct execmem_cache execmem_cache = {
+       .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
+       .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
+                                    execmem_cache.mutex),
+       .free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
+                                    execmem_cache.mutex),
+};
+
+/*
+ * Work item that walks free_areas and releases with vfree() every free area
+ * whose start address and size are both multiples of PMD_SIZE.
+ *
+ * @work: the work item; not used by the function body
+ */
+static void execmem_cache_clean(struct work_struct *work)
+{
+       struct maple_tree *free_areas = &execmem_cache.free_areas;
+       struct mutex *mutex = &execmem_cache.mutex;
+       MA_STATE(mas, free_areas, 0, ULONG_MAX);
+       void *area;
+
+       mutex_lock(mutex);
+       mas_for_each(&mas, area, ULONG_MAX) {
+               size_t size;
+
+               /* entries that do not encode a size are left alone */
+               if (!xa_is_value(area))
+                       continue;
+
+               size = xa_to_value(area);
+
+               if (IS_ALIGNED(size, PMD_SIZE) &&
+                   IS_ALIGNED(mas.index, PMD_SIZE)) {
+                       void *ptr = (void *)mas.index;
+
+                       /* drop the area from the cache before freeing it */
+                       mas_erase(&mas);
+                       vfree(ptr);
+               }
+       }
+       mutex_unlock(mutex);
+}
+
+static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
+
+/*
+ * Overwrite @size bytes at @ptr so that no valid instructions remain.
+ *
+ * The architecture callback is used when one is registered, otherwise the
+ * memory is zero-filled.
+ *
+ * NOTE(review): the memset() fallback does not look at @writable;
+ * presumably architectures that enable the ROX cache are expected to
+ * provide ->invalidate - confirm.
+ */
+static void execmem_invalidate(void *ptr, size_t size, bool writable)
+{
+       if (!execmem_info->invalidate) {
+               memset(ptr, 0, size);
+               return;
+       }
+
+       execmem_info->invalidate(ptr, size, writable);
+}
+
+static void *execmem_vmalloc(struct execmem_range *range, size_t size,
+                            pgprot_t pgprot, unsigned long vm_flags)
 {
        bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
-       unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
        gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
+       unsigned int align = range->alignment;
        unsigned long start = range->start;
        unsigned long end = range->end;
-       unsigned int align = range->alignment;
-       pgprot_t pgprot = range->pgprot;
        void *p;
 
        if (kasan)
                vm_flags |= VM_DEFER_KMEMLEAK;
 
-       p = __vmalloc_node_range(size, align, start, end, gfp_flags,
-                                pgprot, vm_flags, NUMA_NO_NODE,
+       if (vm_flags & VM_ALLOW_HUGE_VMAP)
+               align = PMD_SIZE;
+
+       p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot,
+                                vm_flags, NUMA_NO_NODE,
                                 __builtin_return_address(0));
        if (!p && range->fallback_start) {
                start = range->fallback_start;
@@ -44,6 +102,199 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size)
                return NULL;
        }
 
+       return p;
+}
+
+/*
+ * Insert the range [ptr, ptr + size) into free_areas, merging it with a
+ * free area that ends right before it and/or one that starts right after
+ * it. The stored entry is the total size of the resulting area.
+ *
+ * Returns 0 on success and -ENOMEM when the tree update fails.
+ */
+static int execmem_cache_add(void *ptr, size_t size)
+{
+       struct maple_tree *free_areas = &execmem_cache.free_areas;
+       struct mutex *mutex = &execmem_cache.mutex;
+       unsigned long addr = (unsigned long)ptr;
+       MA_STATE(mas, free_areas, addr - 1, addr + 1);
+       unsigned long first = addr, last = addr + size - 1;
+       unsigned long merged_size = size;
+       void *entry;
+       int err;
+
+       mutex_lock(mutex);
+
+       /* a free area ending at addr - 1 extends the new range downwards */
+       entry = mas_walk(&mas);
+       if (entry && xa_is_value(entry) && mas.last == addr - 1) {
+               first = mas.index;
+               merged_size += xa_to_value(entry);
+       }
+
+       /* a free area starting at addr + size extends it upwards */
+       entry = mas_next(&mas, ULONG_MAX);
+       if (entry && xa_is_value(entry) && mas.index == addr + size) {
+               last = mas.last;
+               merged_size += xa_to_value(entry);
+       }
+
+       mas_set_range(&mas, first, last);
+       err = mas_store_gfp(&mas, xa_mk_value(merged_size), GFP_KERNEL);
+       mutex_unlock(mutex);
+
+       return err ? -ENOMEM : 0;
+}
+
+/*
+ * Carve @size bytes out of the first free area that is large enough.
+ *
+ * The range [addr, addr + size) is recorded in busy_areas and removed from
+ * free_areas; whatever remains of the free area stays in free_areas with
+ * its size updated.
+ *
+ * Returns the start of the allocated range, or NULL when no free area is
+ * large enough or when a tree update fails.
+ */
+static void *__execmem_cache_alloc(size_t size)
+{
+       struct maple_tree *free_areas = &execmem_cache.free_areas;
+       struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+       MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
+       MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
+       struct mutex *mutex = &execmem_cache.mutex;
+       unsigned long addr, last, area_size = 0;
+       void *area, *ptr = NULL;
+       int err;
+
+       mutex_lock(mutex);
+       mas_for_each(&mas_free, area, ULONG_MAX) {
+               area_size = xa_to_value(area);
+               if (area_size >= size)
+                       break;
+       }
+
+       /* also covers an empty tree, where area_size is still 0 */
+       if (area_size < size)
+               goto out_unlock;
+
+       addr = mas_free.index;
+       last = mas_free.last;
+
+       /* insert allocated size to busy_areas at range [addr, addr + size) */
+       mas_set_range(&mas_busy, addr, addr + size - 1);
+       err = mas_store_gfp(&mas_busy, xa_mk_value(size), GFP_KERNEL);
+       if (err)
+               goto out_unlock;
+
+       if (area_size > size) {
+               /*
+                * Store the remaining free size at range [addr + size, last]
+                * before the allocated part is dropped from free_areas, so
+                * that a failed store leaves the free area as it was instead
+                * of losing it from the cache.
+                */
+               mas_set_range(&mas_free, addr + size, last);
+               err = mas_store_gfp(&mas_free, xa_mk_value(area_size - size),
+                                   GFP_KERNEL);
+               if (err) {
+                       mas_erase(&mas_busy);
+                       goto out_unlock;
+               }
+
+               /* the old entry is now limited to [addr, addr + size) */
+               mas_set(&mas_free, addr);
+       }
+
+       /* drop the allocated range from free_areas */
+       mas_erase(&mas_free);
+       ptr = (void *)addr;
+
+out_unlock:
+       mutex_unlock(mutex);
+       return ptr;
+}
+
+/*
+ * Add new memory to the cache: allocate @size rounded up to PMD_SIZE from
+ * vmalloc with huge mappings allowed, fill it with invalid instructions,
+ * remap it with the permissions of @range and insert it into free_areas.
+ *
+ * Returns 0 on success or a negative error code, in which case the
+ * allocation is freed again.
+ */
+static int execmem_cache_populate(struct execmem_range *range, size_t size)
+{
+       unsigned long vm_flags = VM_FLUSH_RESET_PERMS | VM_ALLOW_HUGE_VMAP;
+       size_t alloc_size = round_up(size, PMD_SIZE);
+       unsigned long start, end;
+       struct vm_struct *vm;
+       void *p;
+       int err;
+
+       p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
+       if (!p)
+               return -ENOMEM;
+
+       start = (unsigned long)p;
+       end = start + alloc_size;
+
+       vm = find_vm_area(p);
+       if (!vm) {
+               err = -ENOMEM;
+               goto err_free_mem;
+       }
+
+       /* the PAGE_KERNEL mapping is still in place, so fill it directly */
+       execmem_invalidate(p, alloc_size, /* writable = */ true);
+
+       /* tear down the current mapping ... */
+       vunmap_range_noflush(start, end);
+       flush_tlb_kernel_range(start, end);
+
+       /* FIXME: handle direct map alias */
+
+       /* ... and map the same pages again with the range's protection */
+       err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
+                                      PMD_SHIFT);
+       if (!err)
+               err = execmem_cache_add(p, alloc_size);
+       if (err)
+               goto err_free_mem;
+
+       return 0;
+
+err_free_mem:
+       vfree(p);
+       return err;
+}
+
+/*
+ * Allocate @size bytes from the cache. When the cache cannot satisfy the
+ * request it is populated once using the parameters of @range and the
+ * allocation is retried.
+ *
+ * Returns the allocated address or NULL on failure.
+ */
+static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
+{
+       void *p = __execmem_cache_alloc(size);
+
+       if (p)
+               return p;
+
+       /* nothing suitable is cached: add more memory and try again */
+       if (execmem_cache_populate(range, size))
+               return NULL;
+
+       return __execmem_cache_alloc(size);
+}
+
+/*
+ * Return memory that was allocated from the cache.
+ *
+ * The range is removed from busy_areas, overwritten with invalid
+ * instructions and put back into free_areas; the cleanup work is then
+ * scheduled.
+ *
+ * Returns false when @ptr is not found in busy_areas, so that the caller
+ * can release it another way, and true otherwise.
+ */
+static bool execmem_cache_free(void *ptr)
+{
+       struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+       struct mutex *mutex = &execmem_cache.mutex;
+       unsigned long addr = (unsigned long)ptr;
+       MA_STATE(mas, busy_areas, addr, addr);
+       size_t size;
+       void *area;
+       int err;
+
+       mutex_lock(mutex);
+       area = mas_walk(&mas);
+       if (!area) {
+               mutex_unlock(mutex);
+               return false;
+       }
+       size = xa_to_value(area);
+       mas_erase(&mas);
+       mutex_unlock(mutex);
+
+       execmem_invalidate(ptr, size, /* writable = */ false);
+
+       /*
+        * The range is no longer tracked in busy_areas; if it cannot be
+        * inserted into free_areas either it is lost to the cache, so at
+        * least report that instead of failing silently.
+        */
+       err = execmem_cache_add(ptr, size);
+       if (err) {
+               pr_warn("execmem: failed to return %zu bytes to the cache: %d\n",
+                       size, err);
+               return true;
+       }
+
+       schedule_work(&execmem_cache_clean_work);
+
+       return true;
+}
+
+/*
+ * Allocate @size bytes for @range: from the ROX cache when the range has
+ * EXECMEM_ROX_CACHE set, directly from vmalloc otherwise.
+ */
+static void *__execmem_alloc(struct execmem_range *range, size_t size)
+{
+       void *p;
+
+       if (range->flags & EXECMEM_ROX_CACHE)
+               p = execmem_cache_alloc(range, size);
+       else
+               p = execmem_vmalloc(range, size, range->pgprot,
+                                   VM_FLUSH_RESET_PERMS);
+
        return kasan_reset_tag(p);
 }
 
@@ -61,7 +312,9 @@ void execmem_free(void *ptr)
         * supported by vmalloc.
         */
        WARN_ON(in_interrupt());
-       vfree(ptr);
+
+       if (!execmem_cache_free(ptr))
+               vfree(ptr);
 }
 
 void *execmem_update_copy(void *dst, const void *src, size_t size)
-- 
2.43.0

Reply via email to