There are two kinds of pause bottleneck, one is in user space, the other
is in kernel. Tpause plus backoff could reduce loop count in user space.
To kernel, Because tasks start at same time, they usually arrive critial
area at same time, this decrease performance. tasks started one by one
could avoid it.
include/ChangeLog:
* localfn.h: define RUNLOCALFN.
libgomp/ChangeLog:
* config/linux/wait.h: split do_spin
* env.c (initialize_env): set gomp_thread_delay_count default
value
* libgomp.h: add gomp_thread_delay_count
* team.c (gomp_thread_start): add RUNLOCALFN
* config/linux/spin.h: head file.
* config/linux/x86/localfn.h: implement thread delay.
* config/linux/x86/mutex.c: implement tpause backoff.
* config/linux/x86/spin.h: spin head file.
---
include/localfn.h | 6 +++
libgomp/config/linux/spin.h| 12 ++
libgomp/config/linux/wait.h| 11 ++---
libgomp/config/linux/x86/localfn.h | 19 +
libgomp/config/linux/x86/mutex.c | 66 ++
libgomp/config/linux/x86/spin.h| 5 +++
libgomp/env.c | 4 ++
libgomp/libgomp.h | 1 +
libgomp/team.c | 8 ++--
9 files changed, 121 insertions(+), 11 deletions(-)
create mode 100644 include/localfn.h
create mode 100644 libgomp/config/linux/spin.h
create mode 100644 libgomp/config/linux/x86/localfn.h
create mode 100644 libgomp/config/linux/x86/mutex.c
create mode 100644 libgomp/config/linux/x86/spin.h
diff --git a/include/localfn.h b/include/localfn.h
new file mode 100644
index 000..998e6554aec
--- /dev/null
+++ b/include/localfn.h
@@ -0,0 +1,6 @@
+#define RUNLOCALFN(a, b, c) \
+ do \
+{ \
+ a (b); \
+} \
+ while (0)
diff --git a/libgomp/config/linux/spin.h b/libgomp/config/linux/spin.h
new file mode 100644
index 000..ad8eba275ed
--- /dev/null
+++ b/libgomp/config/linux/spin.h
@@ -0,0 +1,12 @@
+static inline int
+do_spin_for_count (int *addr, int val, unsigned long long count)
+{
+ unsigned long long i;
+ for (i = 0; i < count; i++)
+if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val, 0))
+ return 0;
+else
+ cpu_relax ();
+ return 1;
+}
+
diff --git a/libgomp/config/linux/wait.h b/libgomp/config/linux/wait.h
index 29d745f7141..17b7ef11c96 100644
--- a/libgomp/config/linux/wait.h
+++ b/libgomp/config/linux/wait.h
@@ -44,21 +44,16 @@
extern int gomp_futex_wait, gomp_futex_wake;
#include
-
+#include
static inline int do_spin (int *addr, int val)
{
- unsigned long long i, count = gomp_spin_count_var;
+ unsigned long long count = gomp_spin_count_var;
if (__builtin_expect (__atomic_load_n (_managed_threads,
MEMMODEL_RELAXED)
> gomp_available_cpus, 0))
count = gomp_throttled_spin_count_var;
- for (i = 0; i < count; i++)
-if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val, 0))
- return 0;
-else
- cpu_relax ();
- return 1;
+ return do_spin_for_count (addr, val, count);
}
static inline void do_wait (int *addr, int val)
diff --git a/libgomp/config/linux/x86/localfn.h
b/libgomp/config/linux/x86/localfn.h
new file mode 100644
index 000..379aced99ee
--- /dev/null
+++ b/libgomp/config/linux/x86/localfn.h
@@ -0,0 +1,19 @@
+#ifdef __x86_64__
+static inline void
+gomp_thread_delay(unsigned int count)
+{
+ unsigned long long i;
+ for (i = 0; i < count * gomp_thread_delay_count; i++)
+__builtin_ia32_pause ();
+}
+
+#define RUNLOCALFN(a, b, c) \
+ do \
+{ \
+ gomp_thread_delay(c); \
+ a (b); \
+} \
+ while (0)
+#else
+# include "../../../../include/localfn.h"
+#endif
diff --git a/libgomp/config/linux/x86/mutex.c b/libgomp/config/linux/x86/mutex.c
new file mode 100644
index 000..5a14efb522e
--- /dev/null
+++ b/libgomp/config/linux/x86/mutex.c
@@ -0,0 +1,66 @@
+#include "../mutex.c"
+
+#ifdef __x86_64__
+static inline int
+do_spin_for_count_generic (int *addr, int val, unsigned long long count)
+{
+ unsigned long long i;
+ for (i = 0; i < count; i++)
+if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val,
+ 0))
+ return 0;
+else
+ cpu_relax ();
+ return 1;
+}
+
+#ifndef __WAITPKG__
+#pragma GCC push_options
+#pragma GCC target("waitpkg")
+#define __DISABLE_WAITPKG__
+#endif /* __WAITPKG__ */
+
+static inline unsigned long long __rdtsc(void)
+{
+ unsigned long long var;
+ unsigned int hi, lo;
+
+ __asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
+
+ var = ((unsigned long long)hi << 32) | lo;
+ return var;
+}
+
+#define PAUSE_TP 200
+static inline int
+do_spin_for_backoff_tpause (int *addr, int val, unsigned long long count)
+{
+ unsigned int ctrl = 1;
+ unsigned long long wait_time = 1;
+ unsigned long long mask = 1ULL << __builtin_ia32_bsrdi(count * PAUSE_TP);
+ do