https://gcc.gnu.org/bugzilla/show_bug.cgi?id=33717
--- Comment #6 from Andrew Pinski <pinskia at gcc dot gnu.org> ---

GCC does better now since GCC 10:

.L2:
        movl    (%ebx,%ecx,4), %eax
        xorl    %edx, %edx
        addl    $-1, %eax
        adcl    $0, %edx
        addl    %eax, %esi
        adcl    %edx, %edi
        movl    %esi, -424(%ebp,%ecx,4)
        addl    $1, %ecx
        movl    %edi, %esi
        xorl    %edi, %edi
        cmpl    $100, %ecx
        jne     .L2

But still not as good as ICC:

..B1.4:                         # Preds ..B1.4 ..B1.3
                                # Execution count [5.00e+01]
        addl      32(%esp,%eax,8), %ebx                         #25.5
        movl      %esi, %edx                                    #25.5
        adcl      $0, %edx                                      #25.5
        addl      432(%esp,%eax,8), %ebx                        #25.37
        movl      %ebx, 832(%esp,%eax,8)                        #26.5
        movl      %esi, %ebx                                    #25.5
        adcl      $0, %edx                                      #25.37
        addl      36(%esp,%eax,8), %edx                         #25.5
        adcl      $0, %ebx                                      #25.5
        addl      436(%esp,%eax,8), %edx                        #25.37
        movl      %edx, 836(%esp,%eax,8)                        #26.5
        adcl      $0, %ebx                                      #25.37
        incl      %eax                                          #24.3
        cmpl      $50, %eax                                     #24.3
        jb        ..B1.4        # Prob 98%

LLVM just falls over:

.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        xorl    %ebx, %ebx
        addl    808(%esp,%edx,4), %eax
        setb    %bl
        addl    408(%esp,%edx,4), %eax
        adcl    $0, %ebx
        movl    %eax, 8(%esp,%edx,4)
        cmpl    $100, %edx
        je      .LBB0_3
# %bb.2:                                #   in Loop: Header=BB0_1 Depth=1
        xorl    %eax, %eax
        addl    812(%esp,%edx,4), %ebx
        setb    %al
        addl    412(%esp,%edx,4), %ebx
        adcl    $0, %eax
        movl    %ebx, 12(%esp,%edx,4)
        addl    $2, %edx
        jmp     .LBB0_1
.LBB0_3: