From 15faf122ea37bedb87e968e7d8e641e918973161 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop@nvidia.com>
Date: Fri, 25 Jul 2025 17:55:03 +0200
Subject: [PATCH] tree-parloops: Enable runtime thread detection with
 -ftree-parallelize-loops-auto

This patch adds runtime thread count detection to auto-parallelization.
-ftree-parallelize-loops-auto option generates parallelized loops without
specifying a fixed thread count, deferring this decision to program execution
time where it is controlled by the OMP_NUM_THREADS environment variable.

Bootstrap and regression tested on aarch64-linux.  Compiled SPEC HPC pot3d
https://www.spec.org/hpc2021/docs/benchmarks/628.pot3d_s.html with
-ftree-parallelize-loops-auto and tested both without OMP_NUM_THREADS set in
the environment and with OMP_NUM_THREADS set to different values.

gcc/ChangeLog:

	* doc/invoke.texi (ftree-parallelize-loops-auto): Document.
	* common.opt (ftree-parallelize-loops-auto): New flag for
	runtime thread detection via OMP_NUM_THREADS.
	* builtins.def (DEF_GOMP_BUILTIN): Enable OpenMP builtins for
	the auto flag.
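	* tree-parloops.cc (create_parallel_loop): Omit the num_threads
	clause when the thread count is -1U.
	(gen_parallel_loop): Assume 2 threads in the many-iterations check
	when the thread count is -1U.
	(parallelize_loops): Use -1U for runtime detection when auto flag
	set and skip the estimated-iterations check in that case.
	(gate): Allow pass execution for auto flag.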
	* gcc.cc (LINK_SPEC, GOMP_SELF_SPECS): Add automatic libgomp
	and pthread linking for the auto flag.

gcc/testsuite/ChangeLog:

	* gcc.dg/autopar/runtime-auto.c: New test.

Signed-off-by: Sebastian Pop <spop@nvidia.com>
---
 gcc/builtins.def                            |  3 +-
 gcc/common.opt                              |  4 ++
 gcc/doc/invoke.texi                         | 23 +++++++--
 gcc/gcc.cc                                  |  4 +-
 gcc/testsuite/gcc.dg/autopar/runtime-auto.c | 52 +++++++++++++++++++++
 gcc/tree-parloops.cc                        | 32 +++++++++----
 6 files changed, 104 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/autopar/runtime-auto.c

diff --git a/gcc/builtins.def b/gcc/builtins.def
index f6f3e104f6a..a1f6f0dc553 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -223,7 +223,8 @@ along with GCC; see the file COPYING3.  If not see
                false, true, true, ATTRS, false, \
 	       (flag_openacc \
 		|| flag_openmp \
-		|| flag_tree_parallelize_loops > 1))
+		|| flag_tree_parallelize_loops > 1 \
+		|| flag_tree_parallelize_loops_auto))
 
 /* Builtin used by the implementation of GNU TM.  These
    functions are mapped to the actual implementation of the STM library. */
diff --git a/gcc/common.opt b/gcc/common.opt
index ea39f87ae71..576927eb64e 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3181,6 +3181,10 @@ ftree-parallelize-loops=
 Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(1) Optimization
 -ftree-parallelize-loops=<number>	Enable automatic parallelization of loops.
 
+ftree-parallelize-loops-auto
+Common Var(flag_tree_parallelize_loops_auto) Optimization
+Enable automatic parallelization of loops with runtime thread detection via OMP_NUM_THREADS.
+
 ftree-phiprop
 Common Var(flag_tree_phiprop) Init(1) Optimization
 Enable hoisting loads from conditional pointers.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e0a641213ae..8638e312e2b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14102,14 +14102,31 @@ Perform induction variable optimizations (strength reduction, induction
 variable merging and induction variable elimination) on trees.
 
 @opindex ftree-parallelize-loops
-@item -ftree-parallelize-loops=n
-Parallelize loops, i.e., split their iteration space to run in n threads.
+@item -ftree-parallelize-loops=@var{n}
+Parallelize loops, i.e., split their iteration space to run in @var{n} threads.
 This is only possible for loops whose iterations are independent
 and can be arbitrarily reordered.  The optimization is only
 profitable on multiprocessor machines, for loops that are CPU-intensive,
 rather than constrained e.g.@: by memory bandwidth.  This option
 implies @option{-pthread}, and thus is only supported on targets
-that have support for @option{-pthread}.
+that have support for @option{-pthread}.  The value of @var{n} must be
+greater than 1 for loops to be parallelized; it fixes the number of
+threads at compile time and cannot be changed afterwards, because the
+compiler emits the equivalent of
+@code{#pragma omp parallel num_threads(@var{n})}.
+
+@opindex ftree-parallelize-loops-auto
+@item -ftree-parallelize-loops-auto
+Enable automatic parallelization of loops with runtime thread count
+detection.  Unlike @option{-ftree-parallelize-loops=@var{n}} which
+fixes the number of threads at compile time, this option generates
+parallelized code that determines the thread count at program
+execution time.  The number of threads is controlled by the
+@env{OMP_NUM_THREADS} environment variable.  If @env{OMP_NUM_THREADS}
+is not set, the OpenMP runtime automatically detects the number of
+available processors and uses that value.  This option is useful for
+creating binaries that can adapt to different hardware configurations
+without recompilation.
 
 @opindex ftree-pta
 @item -ftree-pta
diff --git a/gcc/gcc.cc b/gcc/gcc.cc
index 00f93d00f96..8a4f9cc986e 100644
--- a/gcc/gcc.cc
+++ b/gcc/gcc.cc
@@ -1161,7 +1161,7 @@ proper position among the other output files.  */
     %{s} %{t} %{u*} %{z} %{Z} %{!nostdlib:%{!r:%{!nostartfiles:%S}}} \
     %{static|no-pie|static-pie:} %@{L*} %(link_libgcc) " \
     VTABLE_VERIFICATION_SPEC " " SANITIZER_EARLY_SPEC " %o "" \
-    %{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1):\
+    %{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1)|ftree-parallelize-loops-auto:\
 	%:include(libgomp.spec)%(link_gomp)}\
     %{fgnu-tm:%:include(libitm.spec)%(link_itm)}\
     " STACK_SPLIT_SPEC "\
@@ -1342,7 +1342,7 @@ static const char *const multilib_defaults_raw[] = MULTILIB_DEFAULTS;
    for targets that use different start files and suchlike.  */
 #ifndef GOMP_SELF_SPECS
 #define GOMP_SELF_SPECS \
-  "%{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1): " \
+  "%{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1)|ftree-parallelize-loops-auto: " \
   "-pthread}"
 #endif
 
diff --git a/gcc/testsuite/gcc.dg/autopar/runtime-auto.c b/gcc/testsuite/gcc.dg/autopar/runtime-auto.c
new file mode 100644
index 00000000000..d58e8605574
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/autopar/runtime-auto.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-parallelize-loops-auto -fdump-tree-parloops2-details" } */
+
+void abort (void);
+
+#define N 1000
+
+int a[N], b[N], c[N];
+
+void
+test_parallel_loop (void)
+{
+  int i;
+
+  /* This loop should be auto-parallelized when -ftree-parallelize-loops-auto
+     is used for runtime thread detection via OMP_NUM_THREADS.  */
+  for (i = 0; i < N; i++)
+    a[i] = b[i] + c[i];
+}
+
+int
+main (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      b[i] = i;
+      c[i] = i * 2;
+    }
+
+  test_parallel_loop ();
+
+  for (i = 0; i < N; i++)
+    {
+      if (a[i] != b[i] + c[i])
+	abort ();
+    }
+
+  return 0;
+}
+
+/* Check that the loop is parallelized with runtime thread detection.  */
+/* { dg-final { scan-tree-dump "parallelizing" "parloops2" } } */
+
+/* Check that "#pragma omp parallel" is generated.  */
+/* { dg-final { scan-tree-dump "pragma omp parallel" "parloops2" } } */
+
+/* Check that instead of hardcoding the thread count, the generated code
+   calls "__builtin_omp_get_num_threads" to query, at program execution
+   time, the number of threads in the team chosen by the OpenMP runtime.  */
+/* { dg-final { scan-tree-dump "__builtin_omp_get_num_threads" "parloops2" } } */
diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 888a834faf9..0d6b629d4fd 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2768,11 +2768,21 @@ create_parallel_loop (class loop *loop, tree loop_fn, tree data,
       basic_block paral_bb = single_pred (bb);
       gsi = gsi_last_bb (paral_bb);
 
-      gcc_checking_assert (n_threads != 0);
-      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
-      OMP_CLAUSE_NUM_THREADS_EXPR (t)
-	= build_int_cst (integer_type_node, n_threads);
-      omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+      /* Build the OMP_CLAUSE_NUM_THREADS clause only if we have a fixed
+	 thread count.  If n_threads is -1U, let OpenMP runtime determine
+	 the thread count from OMP_NUM_THREADS environment variable.  */
+      if (n_threads > 0 && n_threads != -1U)
+	{
+	  t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+	  OMP_CLAUSE_NUM_THREADS_EXPR (t)
+	    = build_int_cst (integer_type_node, n_threads);
+	  omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+	}
+      else
+	{
+	  /* No hardcoded thread count, let OpenMP runtime decide.  */
+	  omp_par_stmt = gimple_build_omp_parallel (NULL, NULL_TREE, loop_fn, data);
+	}
       gimple_set_location (omp_par_stmt, loc);
 
       gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
@@ -3058,10 +3068,12 @@ gen_parallel_loop (class loop *loop,
       else
 	m_p_thread=MIN_PER_THREAD;
 
-      gcc_checking_assert (n_threads != 0);
+      /* For runtime thread detection (n_threads == -1U), use a conservative
+	 estimate of 2 threads for the many iterations condition check.  */
+      unsigned threads_for_check = (n_threads > 0 && n_threads != -1U) ? n_threads : 2;
       many_iterations_cond =
 	fold_build2 (GE_EXPR, boolean_type_node,
-		     nit, build_int_cst (type, m_p_thread * n_threads - 1));
+		     nit, build_int_cst (type, m_p_thread * threads_for_check - 1));
 
       many_iterations_cond
 	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
@@ -4020,6 +4032,9 @@ parallelize_loops (bool oacc_kernels_p)
      the argument to -ftree-parallelize-loops.  */
   if (oacc_kernels_p)
     n_threads = 0;
+  /* Use runtime thread detection with -ftree-parallelize-loops-auto.  */
+  else if (flag_tree_parallelize_loops_auto)
+    n_threads = -1U;
   else
     n_threads = flag_tree_parallelize_loops;
 
@@ -4095,6 +4110,7 @@ parallelize_loops (bool oacc_kernels_p)
       if (!flag_loop_parallelize_all
 	  && !oacc_kernels_p
 	  && ((estimated != -1
+	       && n_threads > 0 && n_threads != -1U
 	       && (estimated
 		   < ((HOST_WIDE_INT) n_threads
 		      * (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
@@ -4186,7 +4202,7 @@ public:
     if (oacc_kernels_p)
       return flag_openacc;
     else
-      return flag_tree_parallelize_loops > 1;
+      return flag_tree_parallelize_loops > 1 || flag_tree_parallelize_loops_auto;
   }
   unsigned int execute (function *) final override;
   opt_pass * clone () final override
-- 
2.45.2

