RE: [PATCH] x86: set spincount 1 for x86 hybrid platform [PR109812]

2023-10-10 Thread Zhang, Jun
> -Original Message-
> From: Jakub Jelinek 
> Sent: Tuesday, October 10, 2023 3:57 PM
> To: Zhang, Jun 
> Cc: gcc-patches@gcc.gnu.org; ubiz...@gmail.com; Liu, Hongtao
> 
> Subject: Re: [PATCH] x86: set spincount 1 for x86 hybrid platform [PR109812]
> 
> On Tue, Oct 10, 2023 at 12:59:52PM +0800, Jun Zhang wrote:
> > include/ChangeLog:
> >
> > * omphook.h: define RUNOMPHOOK macro.
> 
> ChangeLog formatting.  The description should start with capital letter.  If 
> you add
> a new file, you should just mention : New file. or something similar.
> 
> But much more importantly, we don't do hooks like that in libgomp, it should 
> be
> better a static inline function where the name shows what it is doing (e.g.
> do_adjust_default_spincount), best in some already existing header (e.g. 
> wait.h),

Sorry, I don't know how to place wait.h. This patch is only for x86.
Wait.h is in libgomp/config/linux/.
if I directly place do_adjust_default_spincount in wait.h,
it maybe need #ifdef  __x86_64__, and also need a wait.h in include/
for other os platform.
if I create a new wait.h in libgomp/config/linux/x86/,
It need copy whole wait.h from libgomp/config/linux/, 
also need a wait.h in include/ for other os platform.
Could you help me?

> and best at the end of the if (!parse_spincount ("GOMP_SPINCOUNT",
> _spin_count_var)) block in env.c.
> >
> > libgomp/ChangeLog:
> >
> > * env.c (initialize_env): add RUNOMPHOOK macro.
> > * config/linux/x86/omphook.h: define RUNOMPHOOK macro.
> > ---
> >  include/omphook.h  |  1 +
> >  libgomp/config/linux/x86/omphook.h | 19 +++
> >  libgomp/env.c  |  3 +++
> >  3 files changed, 23 insertions(+)
> >  create mode 100644 include/omphook.h
> >  create mode 100644 libgomp/config/linux/x86/omphook.h
> >
> > diff --git a/include/omphook.h b/include/omphook.h new file mode
> > 100644 index 000..2ebe3ad57e6
> > --- /dev/null
> > +++ b/include/omphook.h
> > @@ -0,0 +1 @@
> > +#define RUNOMPHOOK()
> > diff --git a/libgomp/config/linux/x86/omphook.h
> > b/libgomp/config/linux/x86/omphook.h
> > new file mode 100644
> > index 000..aefb311cc07
> > --- /dev/null
> > +++ b/libgomp/config/linux/x86/omphook.h
> > @@ -0,0 +1,19 @@
> > +#ifdef __x86_64__
> > +#include "cpuid.h"
> > +
> > +/* only for x86 hybrid platform */
> 
> Comments should again start with capital letter and end with dot and 2 spaces
> before */
> 
> > +#define RUNOMPHOOK()  \
> > +  do \
> > +{ \
> > +  unsigned int eax, ebx, ecx, edx; \
> > +  if ((getenv ("GOMP_SPINCOUNT") == NULL) && (wait_policy < 0) \
> 
> Spurious ()s around the subcondition, the first shouldn't be tested when 
> placed
> right, if condition doesn't fit on one line, each subcondition should be on 
> separate
> line.
> 
> > + && __get_cpuid_count (7, 0, , , , ) \
> 
> If we don't have a macro for the CPUID.07H.0H:EDX[15] bit, there should be a
> comment which says what that bit is.
> 
> > + && ((edx >> 15) & 1)) \
> > +   gomp_spin_count_var = 1LL; \
> > +  if (gomp_throttled_spin_count_var > gomp_spin_count_var) \
> > +   gomp_throttled_spin_count_var = gomp_spin_count_var; \
> 
> The above 2 lines won't be needed if placed right.
> 
>   Jakub



[PATCH] Enable tpause Exponential backoff and thread delay

2023-08-02 Thread Zhang, Jun via Gcc-patches
There are two kinds of pause bottleneck, one is in user space, the other
is in kernel. Tpause plus backoff could reduce loop count in user space.
To kernel, Because tasks start at same time, they usually arrive critial
area at same time, this decrease performance. tasks started one by one
could avoid it.

include/ChangeLog:

* localfn.h: define RUNLOCALFN.

libgomp/ChangeLog:

* config/linux/wait.h: split do_spin
* env.c (initialize_env): set gomp_thread_delay_count default
value
* libgomp.h: add gomp_thread_delay_count
* team.c (gomp_thread_start): add RUNLOCALFN
* config/linux/spin.h: head file.
* config/linux/x86/localfn.h: implement thread delay.
* config/linux/x86/mutex.c: implement tpause backoff.
* config/linux/x86/spin.h: spin head file.
---
 include/localfn.h  |  6 +++
 libgomp/config/linux/spin.h| 12 ++
 libgomp/config/linux/wait.h| 11 ++---
 libgomp/config/linux/x86/localfn.h | 19 +
 libgomp/config/linux/x86/mutex.c   | 66 ++
 libgomp/config/linux/x86/spin.h|  5 +++
 libgomp/env.c  |  4 ++
 libgomp/libgomp.h  |  1 +
 libgomp/team.c |  8 ++--
 9 files changed, 121 insertions(+), 11 deletions(-)
 create mode 100644 include/localfn.h
 create mode 100644 libgomp/config/linux/spin.h
 create mode 100644 libgomp/config/linux/x86/localfn.h
 create mode 100644 libgomp/config/linux/x86/mutex.c
 create mode 100644 libgomp/config/linux/x86/spin.h

diff --git a/include/localfn.h b/include/localfn.h
new file mode 100644
index 000..998e6554aec
--- /dev/null
+++ b/include/localfn.h
@@ -0,0 +1,6 @@
+#define RUNLOCALFN(a, b, c)  \
+  do \
+{ \
+  a (b); \
+} \
+  while (0)
diff --git a/libgomp/config/linux/spin.h b/libgomp/config/linux/spin.h
new file mode 100644
index 000..ad8eba275ed
--- /dev/null
+++ b/libgomp/config/linux/spin.h
@@ -0,0 +1,12 @@
+static inline int
+do_spin_for_count (int *addr, int val, unsigned long long count)
+{
+  unsigned long long i;
+  for (i = 0; i < count; i++)
+if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val, 0))
+  return 0;
+else
+  cpu_relax ();
+  return 1;
+}
+
diff --git a/libgomp/config/linux/wait.h b/libgomp/config/linux/wait.h
index 29d745f7141..17b7ef11c96 100644
--- a/libgomp/config/linux/wait.h
+++ b/libgomp/config/linux/wait.h
@@ -44,21 +44,16 @@
 extern int gomp_futex_wait, gomp_futex_wake;
 
 #include 
-
+#include 
 static inline int do_spin (int *addr, int val)
 {
-  unsigned long long i, count = gomp_spin_count_var;
+  unsigned long long count = gomp_spin_count_var;
 
   if (__builtin_expect (__atomic_load_n (_managed_threads,
  MEMMODEL_RELAXED)
 > gomp_available_cpus, 0))
 count = gomp_throttled_spin_count_var;
-  for (i = 0; i < count; i++)
-if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val, 0))
-  return 0;
-else
-  cpu_relax ();
-  return 1;
+  return do_spin_for_count (addr, val, count);
 }
 
 static inline void do_wait (int *addr, int val)
diff --git a/libgomp/config/linux/x86/localfn.h 
b/libgomp/config/linux/x86/localfn.h
new file mode 100644
index 000..379aced99ee
--- /dev/null
+++ b/libgomp/config/linux/x86/localfn.h
@@ -0,0 +1,19 @@
+#ifdef __x86_64__
+static inline void
+gomp_thread_delay(unsigned int count)
+{
+  unsigned long long i;
+  for (i = 0; i < count * gomp_thread_delay_count; i++)
+__builtin_ia32_pause ();
+}
+
+#define RUNLOCALFN(a, b, c)  \
+  do \
+{ \
+  gomp_thread_delay(c); \
+  a (b); \
+} \
+  while (0)
+#else
+# include "../../../../include/localfn.h"
+#endif
diff --git a/libgomp/config/linux/x86/mutex.c b/libgomp/config/linux/x86/mutex.c
new file mode 100644
index 000..5a14efb522e
--- /dev/null
+++ b/libgomp/config/linux/x86/mutex.c
@@ -0,0 +1,66 @@
+#include "../mutex.c"
+
+#ifdef __x86_64__
+static inline int
+do_spin_for_count_generic (int *addr, int val, unsigned long long count)
+{
+  unsigned long long i;
+  for (i = 0; i < count; i++)
+if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val,
+ 0))
+  return 0;
+else
+  cpu_relax ();
+  return 1;
+}
+
+#ifndef __WAITPKG__
+#pragma GCC push_options
+#pragma GCC target("waitpkg")
+#define __DISABLE_WAITPKG__
+#endif /* __WAITPKG__ */
+
+static inline unsigned long long __rdtsc(void)
+{
+  unsigned long long var;
+  unsigned int hi, lo;
+
+  __asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
+
+  var = ((unsigned long long)hi << 32) | lo;
+  return var;
+}
+
+#define PAUSE_TP 200
+static inline int
+do_spin_for_backoff_tpause (int *addr, int val, unsigned long long count)
+{
+  unsigned int ctrl = 1;
+  unsigned long long wait_time = 1;
+  unsigned long long mask = 1ULL << __builtin_ia32_bsrdi(count * PAUSE_TP);
+  do