Currently, libgomp performs initialization of all threads in a team
in its lead thread, and then releases all threads to do work.  This
means that, before reaching the release, each thread is doing nothing,
waiting for the lead threads to do lots of thread initialization
operations.

This initialization is identical for each thread.

We can parallelize it by performing this initialization in each thread,
after releasing each.  This allows the threads of a team to be released
near-immediately, which should cut team startup time roughly by just
under the number of threads.

In order to achieve this, the lead thread prepares the parameters each
thread needs for initialization by copying them into an object each will
be able to read from, and only initializes each remaining thread in the
team with a few pointers.

No functional changes intended in this commit.

libgomp/ChangeLog:

        * libgomp.h (struct gomp_thread_start_data): New struct.  Holds
        thread-independent parameters needed to initialize current
        thread.
        (struct gomp_team): On GCN, add thr_start_data field, that holds
        a gomp_thread_start_data to be used in each thread.
        (struct gomp_thread): Add start_data field, that points to
        thread initialization parameters.
        * config/gcn/team.c (gomp_team_start): Move thread
        initialization steps into ...
        (gomp_prep_our_thread): this new function, such that it reads
        from a gomp_thread_start_data object.
        (gomp_thread_start): Call the above to initialize our thread.
---
 libgomp/config/gcn/team.c | 121 +++++++++++++++++++++++++-------------
 libgomp/libgomp.h         |  31 ++++++++++
 2 files changed, 112 insertions(+), 40 deletions(-)

diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index c9c2f3c24191..1ca4c6b1266a 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -24,6 +24,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 /* This file handles maintenance of threads on AMD GCN.  */
+#include <assert.h>
 
 #include "libgomp.h"
 #include <stdlib.h>
@@ -132,6 +133,33 @@ gomp_gcn_exit_kernel (void)
   team_free (gcn_thrs ());
 }
 
+/* Populate THR from START_DATA.  Assumes THR is current thread.  Argument is
+   broken out to avoid repeated calls to gomp_thread, which may be
+   expensive.  */
+
+static inline void
+gomp_prep_our_thread (struct gomp_thread *thr,
+                     struct gomp_thread_start_data *start_data,
+                     int threadid)
+{
+  thr->ts.team = start_data->team;
+  thr->ts.work_share = &start_data->team->work_shares[0];
+  thr->ts.last_work_share = NULL;
+  thr->ts.team_id = threadid;
+  thr->ts.level = start_data->level;
+  thr->ts.active_level = start_data->active_level;
+  thr->ts.single_count = 0;
+  thr->ts.static_trip = 0;
+  thr->task = &start_data->team->implicit_task[threadid];
+  gomp_init_task (thr->task, start_data->parent_task, &start_data->prev_icvs);
+  /* TODO(arsen): This should be part of a mechanism that allows us to override
+     nthreads-var with OMP_NUM_THREADS.  But, we currently don't have access to
+     that list on the device.
+
+     thr->task->icv.nthreads_var = ...;  */
+  thr->task->taskgroup = start_data->taskgroup;
+}
+
 /* This function contains the idle loop in which a thread waits
    to be called up to become part of a team.  */
 
@@ -162,6 +190,19 @@ gomp_thread_start (struct gomp_thread_pool *pool)
              abort();
            }
        }
+
+      /* Perform rest of task initialization.  Populated from
+        gomp_team_start.  */
+      if (thr->start_data)
+       /* If !start_data, we're probably executing cleanup helpers, so we
+          don't really care about initializing these fields.  */
+       {
+         /* On threads other than the main thread, the thread ID within a
+            team is always equal to dim_pos(1).  */
+         gomp_prep_our_thread (thr, thr->start_data, __builtin_gcn_dim_pos 
(1));
+         thr->start_data = NULL;
+       }
+
       thr->fn (thr->data);
       thr->fn = NULL;
 
@@ -180,61 +221,61 @@ gomp_team_start (void (*fn) (void *), void *data, 
unsigned nthreads,
                 struct gomp_taskgroup *taskgroup)
 {
   struct gomp_thread *thr, *nthr;
-  struct gomp_task *task;
+  struct gomp_task *prev_task;
   struct gomp_task_icv *icv;
   struct gomp_thread_pool *pool;
-  unsigned long nthreads_var;
 
   thr = gomp_thread ();
   pool = thr->thread_pool;
-  task = thr->task;
-  icv = task ? &task->icv : &gomp_global_icv;
+  prev_task = thr->task;
+  icv = prev_task ? &prev_task->icv : &gomp_global_icv;
 
   /* Always save the previous state, even if this isn't a nested team.
      In particular, we should save any work share state from an outer
      orphaned work share construct.  */
   team->prev_ts = thr->ts;
 
-  thr->ts.team = team;
-  thr->ts.team_id = 0;
-  ++thr->ts.level;
-  if (nthreads > 1)
-    ++thr->ts.active_level;
-  thr->ts.work_share = &team->work_shares[0];
-  thr->ts.last_work_share = NULL;
-  thr->ts.single_count = 0;
-  thr->ts.static_trip = 0;
-  thr->task = &team->implicit_task[0];
-  nthreads_var = icv->nthreads_var;
-  gomp_init_task (thr->task, task, icv);
-  team->implicit_task[0].icv.nthreads_var = nthreads_var;
-  team->implicit_task[0].taskgroup = taskgroup;
+  /* Populate start data.  */
+  team->thr_start_data = (struct gomp_thread_start_data) {
+    .team = team,
+    .level = thr->ts.level + 1,
+    .active_level = thr->ts.active_level + (nthreads > 1),
+    .parent_task = thr->task,
+    .prev_icvs = *icv,
+    .taskgroup = taskgroup
+  };
 
-  if (nthreads == 1)
-    return;
-
-  /* Release existing idle threads.  */
-  for (unsigned i = 1; i < nthreads; ++i)
+  if (nthreads != 1)
     {
-      nthr = pool->threads[i];
-      nthr->ts.team = team;
-      nthr->ts.work_share = &team->work_shares[0];
-      nthr->ts.last_work_share = NULL;
-      nthr->ts.team_id = i;
-      nthr->ts.level = team->prev_ts.level + 1;
-      nthr->ts.active_level = thr->ts.active_level;
-      nthr->ts.single_count = 0;
-      nthr->ts.static_trip = 0;
-      nthr->task = &team->implicit_task[i];
-      gomp_init_task (nthr->task, task, icv);
-      team->implicit_task[i].icv.nthreads_var = nthreads_var;
-      team->implicit_task[i].taskgroup = taskgroup;
-      nthr->fn = fn;
-      nthr->data = data;
-      team->ordered_release[i] = &nthr->release;
+      /* When there's more than one thread, we expect that we're operating on
+        thread w/ dim_pos(1) == 0, and that each of the other initialized
+        threads will operate with team_id == dim_pos(1).  */
+      assert (__builtin_gcn_dim_pos (1) == 0);
+      /* We only expect one team to have more than one active thread.  See
+        accelerator-specific logic in gomp_resolve_num_threads.  */
+      assert (!thr->ts.active_level);
+
+      /* Prepare other threads waiting on our barrier.  Besides fn, data,
+        taskgroup, all the fields of those threads are initialized based on
+        the values initialized in our thread above (which is always the master
+        thread).  */
+      for (unsigned i = 1; i < nthreads; ++i)
+       {
+         nthr = pool->threads[i];
+
+         nthr->start_data = &team->thr_start_data;
+         nthr->fn = fn;
+         nthr->data = data;
+         team->ordered_release[i] = &nthr->release;
+       }
+
+      /* Release the other threads.  */
+      gomp_simple_barrier_wait (&pool->threads_dock);
     }
 
-  gomp_simple_barrier_wait (&pool->threads_dock);
+  /* Finish initializing our thread.  The thread ID in the team of the caller
+     is always zero, even if __builtin_gcn_dim_pos (1) != 0.  */
+  gomp_prep_our_thread (thr, &team->thr_start_data, 0);
 }
 
 #include "../../team.c"
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 42f324392957..c51bd680713f 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -775,6 +775,27 @@ struct gomp_target_task
   void *hostaddrs[];
 };
 
+#ifdef __AMDGCN__
+/* Parameters needed to kick off new threads on AMD GCN.  They correspond to
+   various fields in gomp_thread.  This struct, and all its contents, should
+   only be modified by gomp_team_start, and stay untouched until the threads
+   of a team reach the final barrier.  */
+
+struct gomp_thread_start_data
+{
+  /* Team the new thread is part of.  */
+  struct gomp_team *team;
+  /* Active nesting level.  */
+  unsigned level, active_level;
+  /* Parent task.  */
+  struct gomp_task *parent_task;
+  /* Previous ICVs.  */
+  struct gomp_task_icv prev_icvs;
+  /* Task group for the new threads implicit task.  */
+  struct gomp_taskgroup *taskgroup;
+};
+#endif
+
 /* This structure describes a "team" of threads.  These are the threads
    that are spawned by a PARALLEL constructs, as well as the work sharing
    constructs that the team encounters.  */
@@ -857,6 +878,11 @@ struct gomp_team
   /* Number of tasks waiting for their completion event to be fulfilled.  */
   unsigned int task_detach_count;
 
+#ifdef __AMDGCN__
+  /* Used on AMD GCN to inform threads how to launch in a team.  */
+  struct gomp_thread_start_data thr_start_data;
+#endif
+
   /* This array contains structures for implicit tasks.  */
   struct gomp_task implicit_task[];
 };
@@ -870,6 +896,11 @@ struct gomp_thread
   void (*fn) (void *data);
   void *data;
 
+#ifdef __AMDGCN__
+  /* And these are the parameters it should set.  */
+  struct gomp_thread_start_data *start_data;
+#endif
+
   /* This is the current team state for this thread.  The ts.team member
      is NULL only if the thread is idle.  */
   struct gomp_team_state ts;
-- 
2.54.0

Reply via email to