Hello,
I have been investigating a few loops and while looking at:
void matrix_mul_vect(unsigned int N, int *C, short *A, short *B) {
unsigned int i,j;
for (i=0; i<N; i++) {
C[i]=0;
for (j=0; j<N; j++) {
C[i]+=A[i*N+j] * B[j];
}
}
}
I noticed GCC is generating worst code now (trunk and 4.8) than in 4.5.
Using x86_64 as an example (compiled with O2 and gcc 4.8):
matrix_mul_vect:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
xorl %eax, %eax
xorl %ebp, %ebp
testl %edi, %edi
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
je .L1
.p2align 4,,10
.p2align 3
.L7:
movq %rcx, %r9
leal (%rax,%rdi), %ebx
xorl %r10d, %r10d
jmp .L4
.p2align 4,,10
.p2align 3
.L10:
movl %r8d, %r10d
.L4:
movl %eax, %r8d
movswl (%r9), %r11d
addl $1, %eax
movswl (%rdx,%r8,2), %r8d
addq $2, %r9
imull %r11d, %r8d
addl %r10d, %r8d
cmpl %ebx, %eax
jne .L10
movl %r8d, (%rsi,%rbp,4)
addq $1, %rbp
cmpl %ebp, %edi
ja .L7
.L1:
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
For 4.5:
matrix_mul_vect:
.LFB0:
.cfi_startproc
pushq %r13
.cfi_def_cfa_offset 16
testl %edi, %edi
pushq %r12
.cfi_def_cfa_offset 24
pushq %rbp
.cfi_def_cfa_offset 32
pushq %rbx
.cfi_def_cfa_offset 40
je .L1
.cfi_offset 3, -40
.cfi_offset 6, -32
.cfi_offset 12, -24
.cfi_offset 13, -16
leal -1(%rdi), %ebx
xorl %r12d, %r12d
xorl %ebp, %ebp
addq $1, %rbx
leaq 0(,%rbx,4), %r13
addq %rbx, %rbx
.p2align 4,,10
.p2align 3
.L4:
movl $0, (%rsi,%rbp)
movl %r12d, %r9d
xorl %eax, %eax
xorl %r10d, %r10d
.p2align 4,,10
.p2align 3
.L3:
mov %r9d, %r8d
movswl (%rcx,%rax), %r11d
addq $2, %rax
movswl (%rdx,%r8,2), %r8d
addl $1, %r9d
imull %r11d, %r8d
addl %r8d, %r10d
cmpq %rbx, %rax
jne .L3
movl %r10d, (%rsi,%rbp)
addq $4, %rbp
addl %edi, %r12d
cmpq %r13, %rbp
jne .L4
.L1:
popq %rbx
.cfi_def_cfa_offset 32
popq %rbp
.cfi_def_cfa_offset 24
popq %r12
.cfi_def_cfa_offset 16
popq %r13
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
The problem seems to be, at least partially, due to the creation of an extra
basic block that ends up being a loop latch with a useless insn. This can be
seen in the code output by gcc 4.8, label .L10.
Before register allocation blocks 4 and 5 look like (gcc 4.8):
;; basic block 4, loop depth 2, count 0, freq 9100, maybe hot
;; prev block 3, next block 5, flags: (RTL, MODIFIED)
;; pred: 3 [100.0%] (FALLTHRU)
;; 5 [100.0%] (DFS_BACK)
;; bb 4 artificial_defs: { }
;; bb 4 artificial_uses: { u18(6){ }u19(7){ }u20(16){ }u21(20){ }}
;; lr in 6 [bp] 7 [sp] 16 [argp] 20 [frame] 85 91 97 98 101 102 103 104
105
;; lr use 6 [bp] 7 [sp] 16 [argp] 20 [frame] 85 97 98 101 104
;; lr def 17 [flags] 85 96 97 106 108 109
(code_label 61 36 50 4 4 "" [1 uses])
(note 50 61 51 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
(insn 51 50 52 4 (set (reg:DI 106 [ D.3277 ])
(zero_extend:DI (reg:SI 85 [ ivtmp.7 ]))) coremark.c:142 139
{*zero_extendsidi2_rex64}
(nil))
(insn 52 51 53 4 (set (reg:SI 109 [ D.3280 ])
(sign_extend:SI (mem:HI (plus:DI (mult:DI (reg:DI 106 [ D.3277 ])
(const_int 2 [0x2]))
(reg/v/f:DI 104 [ A ])) [4 *_18+0 S2 A16]))) coremark.c:142
153 {extendhisi2}
(expr_list:REG_DEAD (reg:DI 106 [ D.3277 ])
(nil)))
(insn 53 52 54 4 (set (reg:SI 108 [ D.3280 ])
(sign_extend:SI (mem:HI (reg:DI 97 [ ivtmp.8 ]) [4 MEM[base: _63,
offset: 0B]+0 S2 A16]))) coremark.c:142 153 {extendhisi2}
(nil))
(insn 54 53 55 4 (parallel [
(set (reg:SI 109 [ D.3280 ])
(mult:SI (reg:SI 109 [ D.3280 ])
(reg:SI 108 [ D.3280 ])))
(clobber (reg:CC 17 flags))
]) coremark.c:142 353 {*mulsi3_1}
(expr_list:REG_DEAD (reg:SI 108 [ D.3280 ])
(expr_list:REG_UNUSED (reg:CC 17 flags)
(nil))))
(insn 55 54 56 4 (parallel [
(set (reg:SI 96 [ D.3280 ])
(plus:SI (reg:SI 109 [ D.3280 ])
(reg:SI 101 [ D.3283 ])))
(clobber (reg:CC 17 flags))
]) coremark.c:142 273 {*addsi_1}
(expr_list:REG_DEAD (reg:SI 109 [ D.3280 ])
(expr_list:REG_DEAD (reg:SI 101 [ D.3283 ])
(expr_list:REG_UNUSED (reg:CC 17 flags)
(nil)))))
(insn 56 55 57 4 (parallel [
(set (reg:SI 85 [ ivtmp.7 ])
(plus:SI (reg:SI 85 [ ivtmp.7 ])
(const_int 1 [0x1])))
(clobber (reg:CC 17 flags))
]) 273 {*addsi_1}
(expr_list:REG_UNUSED (reg:CC 17 flags)
(nil)))
(insn 57 56 58 4 (parallel [
(set (reg:DI 97 [ ivtmp.8 ])
(plus:DI (reg:DI 97 [ ivtmp.8 ])
(const_int 2 [0x2])))
(clobber (reg:CC 17 flags))
]) 274 {*adddi_1}
(expr_list:REG_UNUSED (reg:CC 17 flags)
(nil)))
(insn 58 57 59 4 (set (reg:CCZ 17 flags)
(compare:CCZ (reg:SI 85 [ ivtmp.7 ])
(reg:SI 98 [ D.3281 ]))) coremark.c:141 7 {*cmpsi_1}
(nil))
(jump_insn 59 58 60 4 (set (pc)
(if_then_else (eq (reg:CCZ 17 flags)
(const_int 0 [0]))
(label_ref 64)
(pc))) coremark.c:141 621 {*jcc_1}
(expr_list:REG_DEAD (reg:CCZ 17 flags)
(expr_list:REG_BR_PROB (const_int 900 [0x384])
(nil)))
-> 64)
;; succ: 5 [91.0%] (FALLTHRU)
;; 6 [9.0%] (LOOP_EXIT)
;; lr out 6 [bp] 7 [sp] 16 [argp] 20 [frame] 85 91 96 97 98 102 103 104
105
;; basic block 5, loop depth 2, count 0, freq 8281, maybe hot
;; prev block 4, next block 6, flags: (RTL, MODIFIED)
;; pred: 4 [91.0%] (FALLTHRU)
;; bb 5 artificial_defs: { }
;; bb 5 artificial_uses: { u35(6){ }u36(7){ }u37(16){ }u38(20){ }}
;; lr in 6 [bp] 7 [sp] 16 [argp] 20 [frame] 85 91 96 97 98 102 103 104
105
;; lr use 6 [bp] 7 [sp] 16 [argp] 20 [frame] 96
;; lr def 101
(note 60 59 34 5 [bb 5] NOTE_INSN_BASIC_BLOCK)
(insn 34 60 84 5 (set (reg:SI 101 [ D.3283 ])
(reg:SI 96 [ D.3280 ])) coremark.c:141 89 {*movsi_internal}
(expr_list:REG_DEAD (reg:SI 96 [ D.3280 ])
(nil)))
(jump_insn 84 34 85 5 (set (pc)
(label_ref 61)) 659 {jump}
(nil)
-> 61)
;; succ: 4 [100.0%] (DFS_BACK)
;; lr out 6 [bp] 7 [sp] 16 [argp] 20 [frame] 85 91 97 98 101 102 103 104
105
However, registers 96 and 101 only show up in block 4 here:
(insn 55 54 56 4 (parallel [
(set (reg:SI 96 [ D.3280 ])
(plus:SI (reg:SI 109 [ D.3280 ])
(reg:SI 101 [ D.3283 ])))
(clobber (reg:CC 17 flags))
]) coremark.c:142 273 {*addsi_1}
(expr_list:REG_DEAD (reg:SI 109 [ D.3280 ])
(expr_list:REG_DEAD (reg:SI 101 [ D.3283 ])
(expr_list:REG_UNUSED (reg:CC 17 flags)
(nil)))))
I don't really understand why GCC is generating this extra basic block however
it looks like a regression. Block 5 could disappear and insn 55 rewritten as
(replaced reg101 by reg96):
(set (reg:SI 96 [ D.3280 ])
(plus:SI (reg:SI 109 [ D.3280 ])
(reg:SI 96 [ D.3283 ])))
It is worse on my port due to several technical constraints. For example, zero
overhead loops can only be generated if there are no internal branches, which
means that any loop without an empty latch can't be transformed into a zero
overhead loop, which is the case with this example. So, loops with empty
latches are desired as much as possible. Once GCC starts creating code like
this performance on my port drops drastically.
What are your thoughts?
Note: I just noticed this doesn't happen in trunk anymore. Any idea of what
went into trunk to fix this?
Paulo Matos