This patch ports team.c to nvptx by adding an initialization/cleanup
routine, gomp_nvptx_main, that all (pre-started) threads can run. It
initializes a thread pool and then runs gomp_thread_start in all threads
except thread zero, which runs the original target region function.
Thread-private data is arranged via a linear array, nvptx_thrs, that is
allocated in gomp_nvptx_main.
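For reference, the per-thread lookup just indexes that array by the
thread's CTA coordinate; a minimal sketch of the accessor added to
libgomp.h below (the name nvptx_gomp_thread_sketch is only for
illustration, the patch uses gomp_thread):

  /* Sketch only: mirrors the __nvptx__ gomp_thread () in the libgomp.h
     hunk below.  Each OpenMP thread occupies one %tid.y slot in the CTA,
     so its descriptor lives at nvptx_thrs[tid].  */
  extern struct gomp_thread *nvptx_thrs;

  static inline struct gomp_thread *
  nvptx_gomp_thread_sketch (void)
  {
    int tid;
    asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
    return nvptx_thrs + tid;
  }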
As in the previous patch, are naked asm() statements OK?
* libgomp.h [__nvptx__] (gomp_thread): New implementation.
* config/nvptx/team.c: Delete.
* team.c: Guard uses of PThreads-specific interfaces by
LIBGOMP_USE_PTHREADS.
(gomp_nvptx_main): New.
(gomp_thread_start) [__nvptx__]: Handle calls from gomp_nvptx_main.
---
libgomp/config/nvptx/team.c | 0
libgomp/libgomp.h | 10 ++++-
libgomp/team.c | 92 ++++++++++++++++++++++++++++++++++++++++++---
3 files changed, 96 insertions(+), 6 deletions(-)
delete mode 100644 libgomp/config/nvptx/team.c
diff --git a/libgomp/config/nvptx/team.c b/libgomp/config/nvptx/team.c
deleted file mode 100644
index e69de29..0000000
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 1454adf..f25b265 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -483,7 +483,15 @@ enum gomp_cancel_kind
/* ... and here is that TLS data. */
-#if defined HAVE_TLS || defined USE_EMUTLS
+#if defined __nvptx__
+extern struct gomp_thread *nvptx_thrs;
+static inline struct gomp_thread *gomp_thread (void)
+{
+ int tid;
+ asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
+ return nvptx_thrs + tid;
+}
+#elif defined HAVE_TLS || defined USE_EMUTLS
extern __thread struct gomp_thread gomp_tls_data;
static inline struct gomp_thread *gomp_thread (void)
{
diff --git a/libgomp/team.c b/libgomp/team.c
index 7671b05..5b74532 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -30,6 +30,7 @@
#include <stdlib.h>
#include <string.h>
+#ifdef LIBGOMP_USE_PTHREADS
/* This attribute contains PTHREAD_CREATE_DETACHED. */
pthread_attr_t gomp_thread_attr;
@@ -43,6 +44,7 @@ __thread struct gomp_thread gomp_tls_data;
#else
pthread_key_t gomp_tls_key;
#endif
+#endif
/* This structure is used to communicate across pthread_create. */
@@ -58,6 +60,52 @@ struct gomp_thread_start_data
bool nested;
};
+#ifdef __nvptx__
+struct gomp_thread *nvptx_thrs;
+
+static struct gomp_thread_pool *gomp_new_thread_pool (void);
+static void *gomp_thread_start (void *);
+
+void __attribute__((kernel))
+gomp_nvptx_main (void (*fn) (void *), void *fn_data)
+{
+ int ntids, tid, laneid;
+ asm ("mov.u32 %0, %%laneid;" : "=r" (laneid));
+ if (laneid)
+ return;
+ static struct gomp_thread_pool *pool;
+ asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
+ asm ("mov.u32 %0, %%ntid.y;" : "=r"(ntids));
+ if (tid == 0)
+ {
+ gomp_global_icv.nthreads_var = ntids;
+
+ nvptx_thrs = gomp_malloc_cleared (ntids * sizeof (*nvptx_thrs));
+
+ pool = gomp_new_thread_pool ();
+ pool->threads = gomp_malloc (ntids * sizeof (*pool->threads));
+ pool->threads[0] = nvptx_thrs;
+ pool->threads_size = ntids;
+ pool->threads_used = ntids;
+ gomp_barrier_init (&pool->threads_dock, ntids);
+
+ nvptx_thrs[0].thread_pool = pool;
+ asm ("bar.sync 0;");
+ fn (fn_data);
+
+ gomp_free_thread (nvptx_thrs);
+ free (nvptx_thrs);
+ }
+ else
+ {
+ struct gomp_thread_start_data tsdata = {0};
+ tsdata.ts.team_id = tid;
+ asm ("bar.sync 0;");
+ tsdata.thread_pool = pool;
+ gomp_thread_start (&tsdata);
+ }
+}
+#endif
/* This function is a pthread_create entry point. This contains the idle
loop in which a thread waits to be called up to become part of a team. */
@@ -71,7 +119,9 @@ gomp_thread_start (void *xdata)
void (*local_fn) (void *);
void *local_data;
-#if defined HAVE_TLS || defined USE_EMUTLS
+#ifdef __nvptx__
+ thr = gomp_thread ();
+#elif defined HAVE_TLS || defined USE_EMUTLS
thr = &gomp_tls_data;
#else
struct gomp_thread local_thr;
@@ -88,7 +138,8 @@ gomp_thread_start (void *xdata)
thr->task = data->task;
thr->place = data->place;
- thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
+ if (thr->ts.team)
+ thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
/* Make thread pool local. */
pool = thr->thread_pool;
@@ -110,6 +161,10 @@ gomp_thread_start (void *xdata)
pool->threads[thr->ts.team_id] = thr;
gomp_barrier_wait (&pool->threads_dock);
+#ifdef __nvptx__
+ local_fn = thr->fn;
+ local_data = thr->data;
+#endif
do
{
struct gomp_team *team = thr->ts.team;
@@ -242,7 +297,13 @@ gomp_free_pool_helper (void *thread_pool)
gomp_sem_destroy (&thr->release);
thr->thread_pool = NULL;
thr->task = NULL;
+#ifdef LIBGOMP_USE_PTHREADS
pthread_exit (NULL);
+#elif defined(__nvptx__)
+ asm ("exit;");
+#else
+#error gomp_free_pool_helper must terminate the thread
+#endif
}
/* Free a thread pool and release its threads. */
@@ -300,33 +361,40 @@ void
gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
unsigned flags, struct gomp_team *team)
{
- struct gomp_thread_start_data *start_data;
struct gomp_thread *thr, *nthr;
struct gomp_task *task;
struct gomp_task_icv *icv;
bool nested;
struct gomp_thread_pool *pool;
unsigned i, n, old_threads_used = 0;
- pthread_attr_t thread_attr, *attr;
unsigned long nthreads_var;
- char bind, bind_var;
+ char bind_var;
+#ifdef LIBGOMP_USE_PTHREADS
+ char bind;
+ struct gomp_thread_start_data *start_data;
+ pthread_attr_t thread_attr, *attr;
unsigned int s = 0, rest = 0, p = 0, k = 0;
+#endif
unsigned int affinity_count = 0;
struct gomp_thread **affinity_thr = NULL;
thr = gomp_thread ();
nested = thr->ts.team != NULL;
+#ifdef LIBGOMP_USE_PTHREADS
if (__builtin_expect (thr->thread_pool == NULL, 0))
{
thr->thread_pool = gomp_new_thread_pool ();
thr->thread_pool->threads_busy = nthreads;
pthread_setspecific (gomp_thread_destructor, thr);
}
+#endif
pool = thr->thread_pool;
task = thr->task;
icv = task ? &task->icv : &gomp_global_icv;
+#ifdef LIBGOMP_USE_PTHREADS
if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
gomp_init_affinity ();
+#endif
/* Always save the previous state, even if this isn't a nested team.
In particular, we should save any work share state from an outer
@@ -352,10 +420,12 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
bind_var = icv->bind_var;
if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
bind_var = flags & 7;
+#ifdef LIBGOMP_USE_PTHREADS
bind = bind_var;
if (__builtin_expect (gomp_bind_var_list != NULL, 0)
&& thr->ts.level < gomp_bind_var_list_len)
bind_var = gomp_bind_var_list[thr->ts.level];
+#endif
gomp_init_task (thr->task, task, icv);
team->implicit_task[0].icv.nthreads_var = nthreads_var;
team->implicit_task[0].icv.bind_var = bind_var;
@@ -365,6 +435,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
i = 1;
+#ifdef LIBGOMP_USE_PTHREADS
if (__builtin_expect (gomp_places_list != NULL, 0))
{
/* Depending on chosen proc_bind model, set subpartition
@@ -432,6 +503,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
}
else
bind = omp_proc_bind_false;
+#endif
/* We only allow the reuse of idle threads for non-nested PARALLEL
regions. This appears to be implied by the semantics of
@@ -481,6 +553,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
unsigned int place_partition_off = thr->ts.place_partition_off;
unsigned int place_partition_len = thr->ts.place_partition_len;
unsigned int place = 0;
+#ifdef LIBGOMP_USE_PTHREADS
if (__builtin_expect (gomp_places_list != NULL, 0))
{
switch (bind)
@@ -612,6 +685,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
place = p + 1;
}
else
+#endif
nthr = pool->threads[i];
nthr->ts.team = team;
nthr->ts.work_share = &team->work_shares[0];
@@ -635,6 +709,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
team->ordered_release[i] = &nthr->release;
}
+#ifdef LIBGOMP_USE_PTHREADS
if (__builtin_expect (affinity_thr != NULL, 0))
{
/* If AFFINITY_THR is non-NULL just because we had to
@@ -695,9 +770,11 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
if (i == nthreads)
goto do_release;
+#endif
}
+#ifdef LIBGOMP_USE_PTHREADS
if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
{
long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;
@@ -829,6 +906,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
pthread_attr_destroy (&thread_attr);
do_release:
+#endif
gomp_barrier_wait (nested ? &team->barrier : &pool->threads_dock);
/* Decrease the barrier threshold to match the number of threads
@@ -935,6 +1013,7 @@ gomp_team_end (void)
}
}
+#ifdef LIBGOMP_USE_PTHREADS
/* Constructors for this file. */
@@ -959,6 +1038,7 @@ team_destructor (void)
crashes. */
pthread_key_delete (gomp_thread_destructor);
}
+#endif
struct gomp_task_icv *
gomp_new_icv (void)
@@ -967,6 +1047,8 @@ gomp_new_icv (void)
struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
gomp_init_task (task, NULL, &gomp_global_icv);
thr->task = task;
+#ifdef LIBGOMP_USE_PTHREADS
pthread_setspecific (gomp_thread_destructor, thr);
+#endif
return &task->icv;
}