https://gcc.gnu.org/g:c04011b3eac8475debd8c0add7a881becb598377

commit r17-1067-gc04011b3eac8475debd8c0add7a881becb598377
Author: Arsen Arsenović <[email protected]>
Date:   Thu Feb 12 15:42:02 2026 +0000

    libgomp/gcn: parallelize initializing threads of a team
    
    Currently, libgomp performs initialization of all threads in a team
    in its lead thread, and then releases all threads to do work.  This
    means that, before reaching the release, each thread is doing nothing,
    waiting for the lead thread to do lots of thread initialization
    operations.
    
    This initialization is identical for each thread.
    
    We can parallelize it by performing this initialization in each thread,
    after each has been released.  This allows the threads of a team to be
    released near-immediately, which should cut team startup time roughly by
    a factor of just under the number of threads.
    
    In order to achieve this, the lead thread prepares the parameters each
    thread needs for initialization by copying them into an object each will
    be able to read from, and only initializes each remaining thread in the
    team with a few pointers.
    
    No functional changes intended in this commit.  It may seem like there
    is a functional change, as gomp_prep_our_thread no longer sets
    icv.nthreads_var, whereas the old code did, but the value that was being
    set by the old code was always equal to the value already present in the
    ICV, because both are initialized from the parent task's ICV (or the
    global ICV if that's missing) and, hence, the write was always redundant.
    
    libgomp/ChangeLog:
    
            * libgomp.h (struct gomp_thread_start_data): New struct.  Holds
            thread-independent parameters needed to initialize current
            thread.
            (struct gomp_team): On GCN, add thr_start_data field, that holds
            a gomp_thread_start_data to be used in each thread.
            (struct gomp_thread): Add start_data field, that points to
            thread initialization parameters.
            * config/gcn/team.c (gomp_team_start): Move thread
            initialization steps into ...
            (gomp_prep_our_thread): this new function, such that it reads
            from a gomp_thread_start_data object.
            (gomp_thread_start): Call the above to initialize our thread.

Diff:
---
 libgomp/config/gcn/team.c | 121 +++++++++++++++++++++++++++++++---------------
 libgomp/libgomp.h         |  31 ++++++++++++
 2 files changed, 112 insertions(+), 40 deletions(-)

diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index c9c2f3c24191..1ca4c6b1266a 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -24,6 +24,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 /* This file handles maintenance of threads on AMD GCN.  */
+#include <assert.h>
 
 #include "libgomp.h"
 #include <stdlib.h>
@@ -132,6 +133,33 @@ gomp_gcn_exit_kernel (void)
   team_free (gcn_thrs ());
 }
 
+/* Populate THR from START_DATA.  Assumes THR is current thread.  Argument is
+   broken out to avoid repeated calls to gomp_thread, which may be
+   expensive.  */
+
+static inline void
+gomp_prep_our_thread (struct gomp_thread *thr,
+                     struct gomp_thread_start_data *start_data,
+                     int threadid)
+{
+  thr->ts.team = start_data->team;
+  thr->ts.work_share = &start_data->team->work_shares[0];
+  thr->ts.last_work_share = NULL;
+  thr->ts.team_id = threadid;
+  thr->ts.level = start_data->level;
+  thr->ts.active_level = start_data->active_level;
+  thr->ts.single_count = 0;
+  thr->ts.static_trip = 0;
+  thr->task = &start_data->team->implicit_task[threadid];
+  gomp_init_task (thr->task, start_data->parent_task, &start_data->prev_icvs);
+  /* TODO(arsen): This should be part of a mechanism that allows us to override
+     nthreads-var with OMP_NUM_THREADS.  But, we currently don't have access to
+     that list on the device.
+
+     thr->task->icv.nthreads_var = ...;  */
+  thr->task->taskgroup = start_data->taskgroup;
+}
+
 /* This function contains the idle loop in which a thread waits
    to be called up to become part of a team.  */
 
@@ -162,6 +190,19 @@ gomp_thread_start (struct gomp_thread_pool *pool)
              abort();
            }
        }
+
+      /* Perform rest of task initialization.  Populated from
+        gomp_team_start.  */
+      if (thr->start_data)
+       /* If !start_data, we're probably executing cleanup helpers, so we
+          don't really care about initializing these fields.  */
+       {
+         /* On threads other than the main thread, the thread ID within a
+            team is always equal to dim_pos(1).  */
+         gomp_prep_our_thread (thr, thr->start_data, __builtin_gcn_dim_pos (1));
+         thr->start_data = NULL;
+       }
+
       thr->fn (thr->data);
       thr->fn = NULL;
 
@@ -180,61 +221,61 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
                 struct gomp_taskgroup *taskgroup)
 {
   struct gomp_thread *thr, *nthr;
-  struct gomp_task *task;
+  struct gomp_task *prev_task;
   struct gomp_task_icv *icv;
   struct gomp_thread_pool *pool;
-  unsigned long nthreads_var;
 
   thr = gomp_thread ();
   pool = thr->thread_pool;
-  task = thr->task;
-  icv = task ? &task->icv : &gomp_global_icv;
+  prev_task = thr->task;
+  icv = prev_task ? &prev_task->icv : &gomp_global_icv;
 
   /* Always save the previous state, even if this isn't a nested team.
      In particular, we should save any work share state from an outer
      orphaned work share construct.  */
   team->prev_ts = thr->ts;
 
-  thr->ts.team = team;
-  thr->ts.team_id = 0;
-  ++thr->ts.level;
-  if (nthreads > 1)
-    ++thr->ts.active_level;
-  thr->ts.work_share = &team->work_shares[0];
-  thr->ts.last_work_share = NULL;
-  thr->ts.single_count = 0;
-  thr->ts.static_trip = 0;
-  thr->task = &team->implicit_task[0];
-  nthreads_var = icv->nthreads_var;
-  gomp_init_task (thr->task, task, icv);
-  team->implicit_task[0].icv.nthreads_var = nthreads_var;
-  team->implicit_task[0].taskgroup = taskgroup;
+  /* Populate start data.  */
+  team->thr_start_data = (struct gomp_thread_start_data) {
+    .team = team,
+    .level = thr->ts.level + 1,
+    .active_level = thr->ts.active_level + (nthreads > 1),
+    .parent_task = thr->task,
+    .prev_icvs = *icv,
+    .taskgroup = taskgroup
+  };
+
+  if (nthreads != 1)
+    {
+      /* When there's more than one thread, we expect that we're operating on
+        thread w/ dim_pos(1) == 0, and that each of the other initialized
+        threads will operate with team_id == dim_pos(1).  */
+      assert (__builtin_gcn_dim_pos (1) == 0);
+      /* We only expect one team to have more than one active thread.  See
+        accelerator-specific logic in gomp_resolve_num_threads.  */
+      assert (!thr->ts.active_level);
+
+      /* Prepare other threads waiting on our barrier.  Besides fn, data,
+        taskgroup, all the fields of those threads are initialized based on
+        the values initialized in our thread above (which is always the master
+        thread).  */
+      for (unsigned i = 1; i < nthreads; ++i)
+       {
+         nthr = pool->threads[i];
 
-  if (nthreads == 1)
-    return;
+         nthr->start_data = &team->thr_start_data;
+         nthr->fn = fn;
+         nthr->data = data;
+         team->ordered_release[i] = &nthr->release;
+       }
 
-  /* Release existing idle threads.  */
-  for (unsigned i = 1; i < nthreads; ++i)
-    {
-      nthr = pool->threads[i];
-      nthr->ts.team = team;
-      nthr->ts.work_share = &team->work_shares[0];
-      nthr->ts.last_work_share = NULL;
-      nthr->ts.team_id = i;
-      nthr->ts.level = team->prev_ts.level + 1;
-      nthr->ts.active_level = thr->ts.active_level;
-      nthr->ts.single_count = 0;
-      nthr->ts.static_trip = 0;
-      nthr->task = &team->implicit_task[i];
-      gomp_init_task (nthr->task, task, icv);
-      team->implicit_task[i].icv.nthreads_var = nthreads_var;
-      team->implicit_task[i].taskgroup = taskgroup;
-      nthr->fn = fn;
-      nthr->data = data;
-      team->ordered_release[i] = &nthr->release;
+      /* Release the other threads.  */
+      gomp_simple_barrier_wait (&pool->threads_dock);
     }
 
-  gomp_simple_barrier_wait (&pool->threads_dock);
+  /* Finish initializing our thread.  The thread ID in the team of the caller
+     is always zero, even if __builtin_gcn_dim_pos (1) != 0.  */
+  gomp_prep_our_thread (thr, &team->thr_start_data, 0);
 }
 
 #include "../../team.c"
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 41850d0f77de..4b31564a6b24 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -775,6 +775,27 @@ struct gomp_target_task
   void *hostaddrs[];
 };
 
+#ifdef __AMDGCN__
+/* Parameters needed to kick off new threads on AMD GCN.  They correspond to
+   various fields in gomp_thread.  This struct, and all its contents, should
+   only be modified by gomp_team_start, and stay untouched until the threads
+   of a team reach the final barrier.  */
+
+struct gomp_thread_start_data
+{
+  /* Team the new thread is part of.  */
+  struct gomp_team *team;
+  /* Active nesting level.  */
+  unsigned level, active_level;
+  /* Parent task.  */
+  struct gomp_task *parent_task;
+  /* Previous ICVs.  */
+  struct gomp_task_icv prev_icvs;
+  /* Task group for the new threads implicit task.  */
+  struct gomp_taskgroup *taskgroup;
+};
+#endif
+
 /* This structure describes a "team" of threads.  These are the threads
    that are spawned by a PARALLEL constructs, as well as the work sharing
    constructs that the team encounters.  */
@@ -857,6 +878,11 @@ struct gomp_team
   /* Number of tasks waiting for their completion event to be fulfilled.  */
   unsigned int task_detach_count;
 
+#ifdef __AMDGCN__
+  /* Used on AMD GCN to inform threads how to launch in a team.  */
+  struct gomp_thread_start_data thr_start_data;
+#endif
+
   /* This array contains structures for implicit tasks.  */
   struct gomp_task implicit_task[];
 };
@@ -870,6 +896,11 @@ struct gomp_thread
   void (*fn) (void *data);
   void *data;
 
+#ifdef __AMDGCN__
+  /* And these are the parameters it should set.  */
+  struct gomp_thread_start_data *start_data;
+#endif
+
   /* This is the current team state for this thread.  The ts.team member
      is NULL only if the thread is idle.  */
   struct gomp_team_state ts;

Reply via email to