On AMD GCN, for each kernel that we execute on the GPUs, the vast
majority of the time preparing the kernel for execution is spent in
memory allocation and deallocation for the kernel arguments.  Out of the
total execution time of run_kernel, which is the GCN plugin function
that actually performs launching a kernel, ~83.5% of execution time is
spent in these (de)allocation routines.

Obviously, then, these calls should be eliminated.  However, it is not
possible to avoid needing to allocate kernel arguments.

To this end, this patch implements a cache of kernel argument
allocations.

We expect this cache to be of size T where T is the maximum number of
kernels being launched in parallel.  This should be a fairly small
number, as there isn't much benefit to executing very many kernels in
parallel (nor, to my awareness, real-world code that does so).

As the kernel argument allocations are of differing sizes, there is
benefit in not making too many disparate classes of sizes.  To prevent
this happening, the plugin rounds up the size of the varying part of the
kernel argument allocation (the target variable table) to a multiple of
sixty-four pointers, and ensures that this size is never zero.  This
should result in the vast majority of allocations being able to reuse
the same cache nodes.

In my experiments (with BabelStream, though this should by no means be
improvements specific to it as run_kernel is used for all kernels and
branches very little), this was able to cut the non-kernel-wait runtime
of run_kernel by a factor of 5.5x.

Cumulatively, this patch and the previous improved BabelStream results
in the default configuration by 47% in the Copy kernel, and 35.5% on
average across all kernels.

libgomp/ChangeLog:

        * plugin/plugin-gcn.c (struct kernel_dispatch): Add a field to
        hold a pointer to the allocation cache node this dispatch is
        holding for kernel arguments, replacing kernarg_address.
        (print_kernel_dispatch): Print the allocation pointer from that
        node as kernargs address.
        (struct agent_info): Add in an allocation cache field.
        (alloc_kernargs_on_agent): New function.  Pulls kernel arguments
        from the cache, or, if no appropriate node is found, allocates
        new ones.
        (create_kernel_dispatch): Round HOST_VARS_SIZE to a multiple of
        64 pointers, and ensure it is nonzero.  Use
        alloc_kernargs_on_agent to allocate kernargs.
        (release_kernel_dispatch): Use release_alloc_cache_node to
        release kernargs.
        (run_kernel): Update usages of kernarg_address to use the kernel
        arguments cache node.
        (GOMP_OFFLOAD_fini_device): Clean up kernargs cache.
        (GOMP_OFFLOAD_init_device): Initialize kernargs cache.
        * alloc_cache.h: New file.
        * testsuite/libgomp.c/alloc_cache-1.c: New test.
---
 libgomp/alloc_cache.h                       | 144 ++++++++++++++++++++
 libgomp/plugin/plugin-gcn.c                 |  87 ++++++++++--
 libgomp/testsuite/libgomp.c/alloc_cache-1.c |  62 +++++++++
 3 files changed, 279 insertions(+), 14 deletions(-)
 create mode 100644 libgomp/alloc_cache.h
 create mode 100644 libgomp/testsuite/libgomp.c/alloc_cache-1.c

diff --git a/libgomp/alloc_cache.h b/libgomp/alloc_cache.h
new file mode 100644
index 000000000000..782569c1faec
--- /dev/null
+++ b/libgomp/alloc_cache.h
@@ -0,0 +1,144 @@
+/* A simple allocation cache.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _ALLOC_CACHE_H
+#define _ALLOC_CACHE_H
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdatomic.h>
+#include <stddef.h>
+#include <pthread.h>
+
+/* A single cached allocation.  All fields immutable.  */
+struct alloc_cache_node
+{
+  /* When taken, someone is using this node, and we can't.  */
+  pthread_mutex_t lock;
+  void *allocation;
+  size_t size;
+
+  struct alloc_cache_node *next;
+};
+
+struct alloc_cache
+{
+  _Atomic (struct alloc_cache_node *) head;
+
+  /* Could be made better by breaking it up into buckets eventually.  Our
+     current allocation pattern is such that most accesses are likely to only
+     ever use the smallest practical allocation, so there isn't much gain in
+     implementing buckets currently.
+
+     Currently, as it is used, this cache will likely be of size O(T) where T
+     is the max number of concurrently executing kernels during the lifetime of
+     the process.  I suspect this value is low, so even with a single bucket,
+     it is likely fast enough to search through.  */
+};
+
+/* Prepare CACHE for use, initializing it as empty.  */
+static inline void
+init_alloc_cache (struct alloc_cache *cache)
+{
+  atomic_init (&cache->head, NULL);
+}
+
+/* Search through CACHE, looking for a non-taken node large enough to fit
+   DESIRED_SIZE bytes.  Returns NULL if no such node exists.  */
+static inline struct alloc_cache_node *
+alloc_cache_try_find (struct alloc_cache *cache, size_t desired_size)
+{
+  for (struct alloc_cache_node *node =
+        atomic_load_explicit (&cache->head, memory_order_acquire);
+       node;
+       node = node->next)
+    {
+      if (node->size < desired_size)
+       continue;
+
+      int ret;
+      if ((ret = pthread_mutex_trylock (&node->lock)) == EBUSY)
+       continue;
+      assert (ret == 0);
+
+      /* It worked!  We found a node that's large enough and free.  */
+      return node;
+    }
+  
+  return NULL;
+}
+
+/* Add a new node for allocation ALLOCATION of SIZE bytes into the cache.  The
+   new node is acquired on return.  */
+static inline struct alloc_cache_node *
+alloc_cache_add_taken_node (struct alloc_cache *cache,
+                           void *allocation,
+                           size_t size)
+{
+  struct alloc_cache_node *new_node = malloc (sizeof (*new_node));
+
+  if (!new_node)
+    return NULL;
+
+  *new_node = (struct alloc_cache_node) {
+    .lock = PTHREAD_MUTEX_INITIALIZER,
+    .allocation = allocation,
+    .size = size,
+    .next = NULL
+  };
+  pthread_mutex_lock (&new_node->lock);
+
+  /* Place it on the top of the stack.  */
+  struct alloc_cache_node *top = (atomic_load_explicit
+                                 (&cache->head, memory_order_acquire));
+
+  do new_node->next = top;
+  while (!atomic_compare_exchange_weak_explicit
+        (&cache->head, &top, new_node,
+         memory_order_acq_rel, memory_order_acquire));
+
+  return new_node;
+}
+
+/* Allow NODE to be used by other users of its cache.  */
+static inline void
+release_alloc_cache_node (struct alloc_cache_node *node)
+{
+  pthread_mutex_unlock (&node->lock);
+}
+
+/* Destroy NODE.  Caller is responsible for cleaning up the allocation inside
+   of NODE, and for making sure that it is not part of any cache that is going
+   to be used in the future.  */
+static inline void
+destroy_alloc_cache_node (struct alloc_cache_node *node)
+{
+  pthread_mutex_destroy (&node->lock);
+  free (node);
+}
+
+
+#endif /* _ALLOC_CACHE_H  */
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 3b94b825cdbd..63fc1ddfae6a 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -41,6 +41,7 @@
 #include <hsa_ext_amd.h>
 #include <dlfcn.h>
 #include <signal.h>
+#include "alloc_cache.h"
 #define _LIBGOMP_PLUGIN_INCLUDE 1
 #include "libgomp-plugin.h"
 #undef _LIBGOMP_PLUGIN_INCLUDE
@@ -281,8 +282,9 @@ struct kernel_dispatch
   struct agent_info *agent;
   /* Pointer to a command queue associated with a kernel dispatch agent.  */
   void *queue;
-  /* Pointer to a memory space used for kernel arguments passing.  */
-  void *kernarg_address;
+  /* Pointer to a memory space used for kernel arguments passing, wrapped in a
+     node from the agent kernel argument cache.  */
+  struct alloc_cache_node *kernarg_cache_node;
   /* Kernel object.  */
   uint64_t object;
   /* Synchronization signal used for dispatch synchronization.  */
@@ -477,6 +479,10 @@ struct agent_info
   /* The HSA memory region from which to allocate kernel arguments.  */
   hsa_region_t kernarg_region;
 
+  /* A stack of allocations in kernarg_region of (sizeof (struct kernargs))
+     size each, used for amortizing kernel argument allocation cost.  */
+  struct alloc_cache kernarg_cache;
+
   /* The HSA memory region from which to allocate device data.  */
   hsa_region_t data_region;
 
@@ -1087,7 +1093,7 @@ dump_executable_symbols (hsa_executable_t executable)
 static void
 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
 {
-  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
+  struct kernargs *kernargs = dispatch->kernarg_cache_node->allocation;
 
   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
@@ -2009,6 +2015,34 @@ alloc_by_agent (struct agent_info *agent, size_t size)
   return ptr;
 }
 
+/* Get a cached kernargs from AGENT, returning an existing one if any are
+   available.  Returns an alloc_cache_node whose value is this allocation.  */
+
+static struct alloc_cache_node *
+alloc_kernargs_on_agent (struct agent_info *agent, size_t size)
+{
+  struct alloc_cache_node *ka_node = (alloc_cache_try_find
+                                     (&agent->kernarg_cache, size));
+
+  /* No free node of sufficient size was found; allocate a fresh one.  The
+     allocation must be SIZE bytes (not just sizeof (struct kernargs)): the
+     node's recorded size is what alloc_cache_try_find matches against, so the
+     allocation behind it has to actually be that large.  */
+  if (!ka_node)
+    {
+      void *ka_addr;
+      hsa_status_t status = hsa_fns.hsa_memory_allocate_fn
+       (agent->kernarg_region, size, &ka_addr);
+      if (status != HSA_STATUS_SUCCESS)
+       hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
+
+      ka_node = alloc_cache_add_taken_node (&agent->kernarg_cache,
+                                           ka_addr,
+                                           size);
+      if (!ka_node)
+       GOMP_PLUGIN_fatal ("Could not allocate cache node for kernel arguments");
+    }
+
+  return ka_node;
+}
+
 /* Create kernel dispatch data structure for given KERNEL, along with
    the necessary device signals and memory allocations.
 
@@ -2062,12 +2096,23 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
       return NULL;
     }
 
-  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
-                                          sizeof (struct kernargs) + host_vars_size,
-                                          &shadow->kernarg_address);
-  if (status != HSA_STATUS_SUCCESS)
-    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
-  struct kernargs *kernargs = shadow->kernarg_address;
+  /* To increase chance of cache hit, round up size of the target variable
+     table to a multiple of (64*sizeof(void*)), and ensure that this size is
+     nonzero.  */
+  if (!host_vars_size)
+    host_vars_size++;
+
+  {
+    constexpr size_t rounding_factor = 64 * sizeof (void*);
+    host_vars_size += rounding_factor - 1;
+    host_vars_size = host_vars_size / rounding_factor * rounding_factor;
+  }
+
+  /* Get an allocation, if possible from the cache.  */
+  shadow->kernarg_cache_node = (alloc_kernargs_on_agent
+                               (agent,
+                                sizeof (struct kernargs) + host_vars_size));
+  struct kernargs *kernargs = shadow->kernarg_cache_node->allocation;
 
   /* Zero-initialize the output_data (minimum needed).  */
   kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
@@ -2166,13 +2211,13 @@ release_kernel_dispatch (struct kernel_dispatch *shadow)
 {
   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
 
-  struct kernargs *kernargs = shadow->kernarg_address;
+  struct kernargs *kernargs = shadow->kernarg_cache_node->allocation;
   void *addr = (void *)kernargs->abi.arena_ptr;
   if (!addr)
     addr = (void *)kernargs->abi.stack_ptr;
   release_ephemeral_memories (shadow->agent, addr);
 
-  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
+  release_alloc_cache_node (shadow->kernarg_cache_node);
 
   hsa_signal_t s;
   s.handle = shadow->signal;
@@ -2420,7 +2465,7 @@ run_kernel (struct kernel_info *kernel, void *vars,
   packet->private_segment_size = shadow->private_segment_size;
   packet->group_segment_size = shadow->group_segment_size;
   packet->kernel_object = shadow->object;
-  packet->kernarg_address = shadow->kernarg_address;
+  packet->kernarg_address = shadow->kernarg_cache_node->allocation;
   hsa_signal_t s;
   s.handle = shadow->signal;
   packet->completion_signal = s;
@@ -2469,9 +2514,9 @@ run_kernel (struct kernel_info *kernel, void *vars,
                                             1000 * 1000,
                                             HSA_WAIT_STATE_BLOCKED) != 0)
     {
-      console_output (kernel, shadow->kernarg_address, false);
+      console_output (kernel, packet->kernarg_address, false);
     }
-  console_output (kernel, shadow->kernarg_address, true);
+  console_output (kernel, packet->kernarg_address, true);
 
   unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
 
@@ -3802,6 +3847,9 @@ GOMP_OFFLOAD_init_device (int n)
   GCN_DEBUG ("Selected device data memory region:\n");
   dump_hsa_region (agent->data_region, NULL);
 
+  /* Prepare kernargs cache.  */
+  init_alloc_cache (&agent->kernarg_cache);
+
   GCN_DEBUG ("GCN agent %d initialized\n", n);
 
   agent->initialized = true;
@@ -4219,6 +4267,17 @@ GOMP_OFFLOAD_fini_device (int n)
   if (status != HSA_STATUS_SUCCESS)
     return hsa_error ("Error destroying command queue", status);
 
+  /* Clean up kernargs cache.  */
+  struct alloc_cache_node *node = agent->kernarg_cache.head;
+  while (node)
+    {
+      hsa_fns.hsa_memory_free_fn (node->allocation);
+
+      struct alloc_cache_node *curr_node = node;
+      node = curr_node->next;
+      destroy_alloc_cache_node (curr_node);
+    }
+
   if (pthread_mutex_destroy (&agent->prog_mutex))
     {
       GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
diff --git a/libgomp/testsuite/libgomp.c/alloc_cache-1.c b/libgomp/testsuite/libgomp.c/alloc_cache-1.c
new file mode 100644
index 000000000000..b71368cba85c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/alloc_cache-1.c
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* Unit-test the alloc cache DS.  */
+#include <assert.h>
+#include <stdint.h>
+
+#include "alloc_cache.h"
+
+int
+main ()
+{
+  struct alloc_cache cache;
+
+  init_alloc_cache (&cache);
+
+  /* Empty cache.  Should return NULL.  */
+  assert (alloc_cache_try_find (&cache, 16) == NULL);
+  assert (alloc_cache_try_find (&cache, 0) == NULL);
+
+  /* Populating it a bit.  */
+  {
+    for (int i = 0; i < 5; i++)
+      {
+       uintptr_t x = 1 << i;
+       __auto_type n = alloc_cache_add_taken_node (&cache, (void *) x, x);
+       assert (n);
+       assert (n->allocation == (void *)x);
+       release_alloc_cache_node (n);
+      }
+  }
+
+  /* Taking five things, each of size 1, should return the whole cache.  */
+  {
+    struct alloc_cache_node *n[5];
+    uint32_t gotten_nodes = 0;
+    for (int i = 0; i < 5; i++)
+      {
+       __auto_type node = n[i] = alloc_cache_try_find (&cache, 1);
+       uintptr_t x = (uintptr_t) node->allocation;
+       gotten_nodes |= x;
+       assert (x == 1
+               || x == 2
+               || x == 4
+               || x == 8
+               || x == 16);
+      }
+    assert (gotten_nodes == 0b11111);
+
+    /* ... and the cache should remain empty.  */
+    assert (alloc_cache_try_find (&cache, 0) == NULL);
+
+    for (int i = 0; i < 5; i++)
+      release_alloc_cache_node (n[i]);
+  }
+
+  /* Taking 16 twice should fail the second time.  */
+  {
+    __auto_type n = alloc_cache_try_find (&cache, 16);
+    assert (n != NULL && ((uintptr_t) n->allocation) == 16);
+    assert (alloc_cache_try_find (&cache, 16) == NULL);
+    release_alloc_cache_node (n);
+  }
+}
-- 
2.53.0

Reply via email to