https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65468

            Bug ID: 65468
           Summary: Optimize static schedule with chunk_size one
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vries at gcc dot gnu.org

Consider test.c:
...
extern void abort ();

/* Return the sum of 0 .. 9, computed by an OpenMP parallel loop.  */
int
bar ()
{
  int a = 0, i;

  /* Request three threads and a static schedule with chunk size 1, so
     iterations are dealt to the threads one at a time in round-robin
     order.  The reduction clause combines the per-thread partial sums
     into a.  */
#pragma omp parallel for num_threads (3) reduction (+:a) schedule(static, 1)
  for (i = 0; i < 10; i++)
    a += i;

  return a;
}

/* Run bar () and abort unless it returns 45 (0 + 1 + ... + 9).  */
int
main (void)
{
  int res;
  res = bar ();
  if (res != 45)
    abort ();
  return 0;
}
...


So, we create 3 threads, and the schedule will be:
threadnr | iterations
---------------------
0        | 0 3 6 9
1        | 1 4 7
2        | 2 5 8


The code is generated using expand_for_omp_static_chunk, which results in the
following code for -O2 -fopenmp (optimized dump):
...
bar._omp_fn.0 (struct .omp_data_s.0 & restrict .omp_data_i)
{
  int i;
  int a;
  int _6;
  int _11;
  int * _17;
  int _21;
  unsigned int _23;
  int _25;
  int _26;
  unsigned int _27;
  int _29;
  unsigned int _31;
  unsigned int _32;
  int _33;
  unsigned int _34;
  unsigned int pretmp_35;
  unsigned int prephitmp_36;

  <bb 2>:
  _6 = __builtin_omp_get_num_threads ();
  i_7 = __builtin_omp_get_thread_num ();
  _25 = i_7 + 1;
  _26 = MIN_EXPR <_25, 10>;
  if (i_7 <= 9)
    goto <bb 3>;
  else
    goto <bb 8>;

  <bb 3>:
  # a_3 = PHI <0(2)>
  # i_24 = PHI <i_7(2)>
  # _21 = PHI <_26(2)>

  <bb 4>:
  # a_12 = PHI <a_3(3), a_13(6)>
  # i_5 = PHI <i_24(3), i_22(6)>
  # _29 = PHI <_21(3), _11(6)>

  <bb 5>:
  # a_1 = PHI <a_12(4), a_13(5)>
  # i_4 = PHI <i_5(4), i_14(5)>
  a_13 = a_1 + i_4;
  i_14 = i_4 + 1;
  if (i_14 < _29)
    goto <bb 5>;
  else
    goto <bb 6>;

  <bb 6>:
  _32 = (unsigned int) i_5;
  _31 = (unsigned int) _6;
  _23 = _31 + _32;
  i_22 = (int) _23;
  _27 = _23;
  _34 = _27 + 1;
  _33 = (int) _34;
  _11 = MIN_EXPR <_33, 10>;
  if (i_22 <= 9)
    goto <bb 4>;
  else
    goto <bb 7>;

  <bb 7>:
  pretmp_35 = (unsigned int) a_13;

  <bb 8>:
  # prephitmp_36 = PHI <pretmp_35(7), 0(2)>
  _17 = &.omp_data_i_16(D)->a;
  __atomic_fetch_add_4 (_17, prephitmp_36, 0); [tail call]
  return;

}
...

The code contains a loop nest with two loops: the inner loop handles a single
chunk, and the outer loop iterates over the chunks assigned to the thread.

For a chunk size of one, the inner loop always executes its body exactly once:
on entry its bound _29 is MIN_EXPR <i_5 + 1, 10>, which equals i_5 + 1 because
i_5 <= 9 there. But the compiler doesn't manage to optimize the inner loop
away.

Reply via email to