On 8/18/25 14:37, Sebastian Huber wrote:
----- Am 18. Aug 2025 um 9:33 schrieb Jørgen Kvalsvik j...@lambda.is:

On 8/18/25 02:58, Sebastian Huber wrote:
Hello,

I have question to the counters used for the condition coverage implementation
in tree-profile.cc

/* Stores the incoming edge and previous counters (in SSA form) on that edge
     for the node e->deston that edge for the node e->dest.  The counters record
     the seen-true (0), seen-false (1), and current-mask (2).  They are stored 
in
     an array rather than proper members for access-by-index as the code paths
     tend to be identical for the different counters.  */
struct counters
{
      edge e;
      tree counter[3];
      tree& operator [] (size_t i) { return counter[i]; }
};

While working on the -fprofile-update=atomic support for 32-bit targets which
lack support for 64-bit atomic operations, I noticed that some atomic
no-operations are generated for the instrumented code
(https://gcc.gnu.org/pipermail/gcc-patches/2025-August/692555.html). For
example:

int a(int i);
int b(int i);

int f(int i)
{
      if (i) {
        return a(i);
      } else {
        return b(i);
      }
}

gcc -O2 -fprofile-update=atomic -fcondition-coverage -S -o - test.c
-fdump-tree-all

;; Function f (f, funcdef_no=0, decl_uid=4621, cgraph_uid=1, symbol_order=0)

int f (int i)
{
    int _1;
    int _6;
    int _8;

    <bb 2> [local count: 1073741824]:
    if (i_3(D) != 0)
      goto <bb 3>; [50.00%]
    else
      goto <bb 4>; [50.00%]

    <bb 3> [local count: 536870912]:
    __atomic_fetch_or_8 (&__gcov8.f[0], 1, 0);
    __atomic_fetch_or_8 (&__gcov8.f[1], 0, 0);
    _8 = a (i_3(D)); [tail call]
    goto <bb 5>; [100.00%]

    <bb 4> [local count: 536870912]:
    __atomic_fetch_or_8 (&__gcov8.f[0], 0, 0);
    __atomic_fetch_or_8 (&__gcov8.f[1], 1, 0);
    _6 = b (0); [tail call]

    <bb 5> [local count: 1073741824]:
    # _1 = PHI <_8(3), _6(4)>
    return _1;

}

The __atomic_fetch_or_8 (&__gcov8.f[1], 0, 0) and __atomic_fetch_or_8
(&__gcov8.f[0], 0, 0) could be optimized away. Since GCC is able to figure out
that the masks are compile-time constants wouldn't it be possible to use a
simple uint64_t for the current-mask (2) in struct counters?

Is this something you're seeing consistently, even when the number of
conditions go up?

I only looked at simple test cases while working on the support for splitting 
up the atomic operations. In a slightly more complicated test function

int a(int i);
int b(int i);
int c(int i);

int f(int i, int j, int k, int l, int m)
{
   if (i && j && (k || !l || m)) {
     if (a(i) || b(i) || l) {
       return a(i);
     } {
       return c(i);
     }
   } else if (k && m) {
     return b(i);
   } else {
     return 123;
   }
}

all the masks are still compile-time constants:

;; Function f (f, funcdef_no=0, decl_uid=4627, cgraph_uid=1, symbol_order=0)

Removing basic block 20
int f (int i, int j, int k, int l, int m)
{
   _Bool _1;
   _Bool _2;
   _Bool _3;
   _Bool _5;
   _Bool _6;
   int _7;
   int _8;
   _Bool _10;
   _Bool _11;
   int _12;
   int _24;
   int _26;
   int _28;
   _Bool _76;

   <bb 2> [local count: 1073741822]:
   _1 = i_15(D) != 0;
   _2 = j_16(D) != 0;
   _3 = _1 & _2;
   _76 = k_17(D) != 0;
   if (_3 != 0)
     goto <bb 3>; [50.00%]
   else
     goto <bb 16>; [50.00%]

   <bb 3> [local count: 536870911]:
   _5 = l_18(D) == 0;
   _6 = _5 | _76;
   if (_6 != 0)
     goto <bb 4>; [33.00%]
   else
     goto <bb 5>; [67.00%]

   <bb 4> [local count: 177167400]:
   __atomic_fetch_or_8 (&__gcov8.f[0], 3, 0);
   __atomic_fetch_or_8 (&__gcov8.f[1], 0, 0);
   goto <bb 8>; [100.00%]

   <bb 5> [local count: 359703512]:
   if (m_19(D) != 0)
     goto <bb 7>; [50.00%]
   else
     goto <bb 6>; [50.00%]

   <bb 6> [local count: 179851756]:
   __atomic_fetch_or_8 (&__gcov8.f[0], 0, 0);
   __atomic_fetch_or_8 (&__gcov8.f[1], 6, 0);
   goto <bb 17>; [100.00%]

   <bb 7> [local count: 179851756]:
   __atomic_fetch_or_8 (&__gcov8.f[0], 5, 0);
   __atomic_fetch_or_8 (&__gcov8.f[1], 0, 0);

   <bb 8> [local count: 357019156]:
   _7 = a (i_15(D));
   if (_7 != 0)
     goto <bb 9>; [34.00%]
   else
     goto <bb 10>; [66.00%]

   <bb 9> [local count: 121386514]:
   __atomic_fetch_or_8 (&__gcov8.f[2], 1, 0);
   __atomic_fetch_or_8 (&__gcov8.f[3], 0, 0);
   goto <bb 14>; [100.00%]

   <bb 10> [local count: 235632641]:
   _8 = b (i_15(D));
   if (_8 != 0)
     goto <bb 11>; [34.00%]
   else
     goto <bb 12>; [66.00%]

   <bb 11> [local count: 80115099]:
   __atomic_fetch_or_8 (&__gcov8.f[2], 2, 0);
   __atomic_fetch_or_8 (&__gcov8.f[3], 0, 0);
   goto <bb 14>; [100.00%]

   <bb 12> [local count: 155517543]:
   if (l_18(D) != 0)
     goto <bb 13>; [67.00%]
   else
     goto <bb 15>; [33.00%]

   <bb 13> [local count: 104196754]:
   __atomic_fetch_or_8 (&__gcov8.f[2], 4, 0);
   __atomic_fetch_or_8 (&__gcov8.f[3], 0, 0);

   <bb 14> [local count: 305698367]:
   _26 = a (i_15(D)); [tail call]
   goto <bb 19>; [100.00%]

   <bb 15> [local count: 51320789]:
   __atomic_fetch_or_8 (&__gcov8.f[2], 0, 0);
   __atomic_fetch_or_8 (&__gcov8.f[3], 7, 0);
   _24 = c (i_15(D)); [tail call]
   goto <bb 19>; [100.00%]

   <bb 16> [local count: 536870911]:
   __atomic_fetch_or_8 (&__gcov8.f[0], 0, 0);
   __atomic_fetch_or_8 (&__gcov8.f[1], 1, 0);
   _10 = m_19(D) != 0;
   _11 = _10 & _76;
   if (_11 != 0)
     goto <bb 18>; [63.77%]
   else
     goto <bb 17>; [36.23%]

   <bb 17> [local count: 374344247]:
   __atomic_fetch_or_8 (&__gcov8.f[4], 0, 0);
   __atomic_fetch_or_8 (&__gcov8.f[5], 1, 0);
   goto <bb 19>; [100.00%]

   <bb 18> [local count: 342378420]:
   __atomic_fetch_or_8 (&__gcov8.f[4], 1, 0);
   __atomic_fetch_or_8 (&__gcov8.f[5], 0, 0);
   _28 = b (i_15(D)); [tail call]

   <bb 19> [local count: 1073741824]:
   # _12 = PHI <_26(14), _24(15), _28(18), 123(17)>
   return _12;

}

I tried this program:

int a(int i);
int b(int i);
int c(int i);

int f(int i, int j, int k, int l, int m)
{
  if (i && j && (k || !l || m)) {
    if (a(i) || b(i) || l) {
      return a(i);
    } {
      return c(i);
    }
  } else if (k && m) {
    return b(i);
  } else {
    return 123;
  }
}

And compiled it like you did: gcc -O2 -fprofile-update=atomic -fcondition-coverage -S -o - test2.c -fdump-tree-all

Most of the atomics operate on consts, but there is this:

  <bb 9> [local count: 234572472]:
  _39 = _35 | 16;
  _40 = _36 | 1;
  _53 = ~_40;
  _54 = _34 & _53;
  _55 = _39 & _53;
  __atomic_fetch_or_8 (&__gcov8.f[0], _54, 0);
__atomic_fetch_or_8 (&__gcov8.f[1], _55, 0);


We could still probably detect the zero from the instrumentation, but this is a case where the ior is path dependent. I would think a proper elimination is better tied to the args to __atomic_fetch though, it must surely be able to detect that it is a no-op?

Thanks,
Jørgen



I'm sure it's possible to optimize out this case by checking if the
current-mask still is the initial zero constant. The inputs that
determine the current-mask are constant and tied to the node, but the
final state of current-mask once the counters are flushed depends on the
path taken. I'll try to think a bit more about it, but I don't think it
can be replaced by a uint64_t without a redesign of the
instrument_decisions function.

Ok, thanks for having a look at it.


Reply via email to