Hi,
so I found the problem.  We duplicate multiple paths and end up with:

;; basic block 6, loop depth 0, count 365072224 (estimated locally, freq 0.3400)
;;  prev block 12, next block 7, flags: (NEW, REACHABLE, VISITED)
;;  pred:       4 [never (guessed)]  count:0 (estimated locally, freq 0.0000) 
(TRUE_VALUE,EXECUTABLE)
;;              10 [always]  count:365072224 (estimated locally, freq 0.3400) 
(FALLTHRU,EXECUTABLE)
# _18 = PHI <0(4), 0(10)>
# d_39 = PHI <d_40(D)(4), d_40(D)(10)>
if (_18 == 0)
  goto <bb 8>; [97.06%]
else
  goto <bb 7>; [2.94%]
;;  succ:       8 [97.1% (guessed)]  count:354334801 (estimated locally, freq 
0.3300) (TRUE_VALUE,EXECUTABLE)
;;              7 [2.9% (guessed)]  count:10737423 (estimated locally, freq 
0.0100) (FALSE_VALUE,EXECUTABLE)

Here the goto to bb 7 is never taken (_18 is always 0), but the profile is
wrong: it still gives that edge a probability of 2.94%.

Before threading we have a chain of conditionals:

  __asm__("pushf{l|d}
        pushf{l|d}
        pop{l}  %0
        mov{l}  {%0, %1|%1, %0}
        xor{l}  {%2, %0|%0, %2}
        push{l} %0
        popf{l|d}
        pushf{l|d}
        pop{l}  %0
        popf{l|d}
        " : "=&r" __eax_19, "=&r" __ebx_20 : "i" 2097152);
  _21 = __eax_19 ^ __ebx_20;
  _22 = _21 & 2097152;
  if (_22 == 0)
    goto <bb 4>; [34.00%]
  else
    goto <bb 3>; [66.00%]
  
  <bb 3> [local count: 708669602 freq: 0.660000]:
  __asm__ __volatile__("cpuid
        " : "=a" __eax_24, "=b" __ebx_25, "=c" __ecx_26, "=d" __edx_27 : "0" 0);
  
  <bb 4> [local count: 1073741826 freq: 1.000000]:
  # _33 = PHI <0(2), __eax_24(3)> 
  _16 = _33 == 0;
  if (_33 == 0)
    goto <bb 6>; [34.00%]
  else
    goto <bb 5>; [66.00%]

  <bb 5> [local count: 708669600 freq: 0.660000]:
  __asm__ __volatile__("cpuid
        " : "=a" a_44, "=b" b_45, "=c" c_46, "=d" d_47 : "0" 1, "2" 0);

  <bb 6> [local count: 1073741824 freq: 1.000000]:
  # _18 = PHI <0(4), 1(5)>
  # d_39 = PHI <d_40(D)(4), d_47(5)>
  if (_18 == 0)
    goto <bb 8>; [33.00%]
  else
    goto <bb 7>; [67.00%]


If the first test finds _22 == 0, then also _33 == 0 and _18 == 0, but the
last case has probability 33% while the first has 34%, so the profile guess
is not consistent with the threaded path.  So threading is right to end up
with a profile inconsistency, but it should print the reason for doing so.

One option is to disable the optimization for the check.  The other option
is to get the first conditional predicted right.
Would this be OK?

gcc/ChangeLog:

        * config/i386/cpuid.h (__get_cpuid_max): Mark CPUs not supporting
        cpuid as unlikely.

diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index 03fd6fc9478..9c768ac0b6d 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -295,7 +295,7 @@ __get_cpuid_max (unsigned int __ext, unsigned int *__sig)
           : "i" (0x00200000));
 #endif
 
-  if (!((__eax ^ __ebx) & 0x00200000))
+  if (__builtin_expect (!((__eax ^ __ebx) & 0x00200000), 0))
     return 0;
 #endif
 

Reply via email to