Hi All,
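It looks like strength reduction is not optimal for the following example.
The loop body uses 7 instructions per iteration, but only 4 (or 5 without
LFTR, i.e. linear function test replacement) are necessary. In particular,
x = x+d is emitted as two adds (addl %ebp,%ebx and addl $44,%ebx) instead of
a single add of the loop-invariant d.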
Best Regards,
Yiran Wang
bash-4.0$ cat x.c
int foo(int x, int b, int *__restrict a) /* compiler test case; returns x after the loop */
{
int i; /* loop counter */
int c,d ;
c = b*60;
d = c+44; /* d = b*60 + 44; not modified inside the loop */
for (i = 0; i< b; i++) /* body runs b times; not at all when b <= 0 */
{
x = x+d; /* running sum: x grows by d each iteration */
*a++=x; /* store the sum, then advance a to the next int */
}
return x;
}
bash-4.0$ /opt/open64tr/bin/opencc -c -O3 -keep x.c
-Wb,-trlow,-tt25:0xffffffff -OPT:unroll_times_max=1 -march=barcelona
bash-4.0$ cat x.s
# /opt/open64tr/lib/gcc-lib/x86_64-open64-linux/5.0/be::5.0
#-----------------------------------------------------------
# Compiling x.c (x.I)
#-----------------------------------------------------------
#-----------------------------------------------------------
# Options:
#-----------------------------------------------------------
# Target:Barcelona, ISA:ISA_1, Endian:little, Pointer Size:32
# -O3 (Optimization level)
# -g0 (Debug level)
# -m2 (Report advisories)
#-----------------------------------------------------------
int foo(int x, int b, int *__restrict a) /* compiler test case; returns x after the loop */
{
int i; /* loop counter */
int c,d ;
c = b*60;
d = c+44; /* d = b*60 + 44; not modified inside the loop */
for (i = 0; i< b; i++) /* body runs b times; not at all when b <= 0 */
{
x = x+d; /* running sum: x grows by d each iteration */
*a++=x; /* store the sum, then advance a to the next int */
}
return x;
}
bash-4.0$ /opt/open64tr/bin/opencc -c -O3 -keep x.c
-Wb,-trlow,-tt25:0xffffffff -OPT:unroll_times_max=1 -march=barcelona
bash-4.0$ cat x.s
# /opt/open64tr/lib/gcc-lib/x86_64-open64-linux/5.0/be::5.0
#-----------------------------------------------------------
# Compiling x.c (x.I)
#-----------------------------------------------------------
#-----------------------------------------------------------
# Options:
#-----------------------------------------------------------
# Target:Barcelona, ISA:ISA_1, Endian:little, Pointer Size:32
# -O3 (Optimization level)
# -g0 (Debug level)
# -m2 (Report advisories)
#-----------------------------------------------------------
.text
.align 2
.section .text
.p2align 5,,
# Program Unit: foo
#
# int foo(int x, int b, int *__restrict a) -- compiler output for x.c
# (GAS, AT&T syntax: source operand first, destination last).
# Arguments are read from the stack. After the three pushes and the 16-byte
# frame set up below they sit at 32(%esp) = x, 36(%esp) = b, 40(%esp) = a.
# The result is returned in %eax.
# Register roles inside the loop:
#   %ebx = x (running sum)     %ebp = c = b*60     %eax = a (store pointer)
#   %edx = i (loop counter)    %edi = b (loop bound)
# The trailing "# [n]" notes are emitted by the compiler (presumably
# scheduling cycles -- not confirmed from this listing).
.globl foo
.type foo, @function
foo: # 0x0
# .frame %esp, 16, %esp
# _temp_gra_spill0 = 0
.loc 1 2 0
# 1 int foo(int x, int b, int *__restrict a)
# 2 {
.LBB1_foo:
# Prologue: save %ebp, %ebx, %edi and reserve a 16-byte frame.
pushl %ebp # [0]
pushl %ebx # [3]
pushl %edi # [6]
addl $-16,%esp # [9]
# Zero-trip guard: branch to .Lt_0_2818 when b - 1 is negative.
movl 36(%esp),%edi # [10] b
leal -1(%edi),%eax # [13]
testl %eax,%eax # [14]
jl .Lt_0_2818 # [15]
.LBB2_foo:
# Loop preheader: load x and a, compute c = b*60 in %ebp, zero the counter.
movl %edi,%ebp # [0]
.loc 1 8 0
# 4 int c,d ;
# 5 c = b*60;
# 6 d = c+44;
# 7
# 8 for (i = 0; i< b; i++)
movl %edi,%ecx # [0]
movl 32(%esp),%ebx # [0] x
# b is also copied to the spill slot at 0(%esp); nothing in this listing
# reloads it.
movl %ecx,0(%esp) # [1] _temp_gra_spill0
imull $60,%ebp # [1]
movl 40(%esp),%eax # [1] a
xorl %edx,%edx # [2]
.p2align 5,,31
.Lt_0_3586:
#<loop> Loop body line 8, nesting depth: 1, estimated iterations: 1000
# Seven instructions per iteration. x = x + d is performed as two adds
# (c in %ebp, then the constant 44) rather than one add of a precomputed d.
# Both the counter %edx and the pointer %eax are incremented every iteration.
.loc 1 11 0
# 9 {
# 10 x = x+d;
# 11 *a++=x;
addl $1,%edx # [0]
.loc 1 10 0
addl %ebp,%ebx # [0]
.loc 1 11 0
addl $4,%eax # [0]
.loc 1 10 0
addl $44,%ebx # [1]
.loc 1 11 0
cmpl %edi,%edx # [1]
# The store uses -4(%eax) because %eax was already advanced past the element.
# movl does not modify flags, so jl still tests the cmpl above (i < b, signed).
movl %ebx,-4(%eax) # [2] id:17
jl .Lt_0_3586 # [2]
.Lt_0_4098:
# Loop exit: return the accumulated x, then release the frame and restore
# the saved registers in reverse push order.
.loc 1 13 0
# 12 }
# 13 return x;
movl %ebx,%eax # [0]
addl $16,%esp # [0]
popl %edi # [1]
popl %ebx # [4]
popl %ebp # [7]
ret # [7]
.p2align 5,,31
.Lt_0_2818:
# Zero-trip path: return the incoming x unchanged.
.loc 1 11 0
movl 32(%esp),%eax # [0] x
.loc 1 13 0
addl $16,%esp # [0]
popl %edi # [1]
popl %ebx # [4]
popl %ebp # [7]
ret # [7]
.LDWend_foo:
.size foo, .LDWend_foo-foo
.section .text
.align 4
.section .eh_frame, "a",@progbits
.LEHCIE:
.4byte .LEHCIE_end - .LEHCIE_begin
.LEHCIE_begin:
.4byte 0x0
.byte 0x01, 0x00, 0x01, 0x7c, 0x08, 0x0c, 0x04, 0x04
.byte 0x88, 0x01
.align 4
.LEHCIE_end:
.section .debug_line, ""
.section .note.GNU-stack,"",@progbits
.ident "#Open64 Compiler Version 5.0 : x.c compiled with : -O3
-OPT:unroll_times_max=1 -march=barcelona -msse2 -msse3 -mno-3dnow
-mno-sse4a -mno-ssse3 -mno-sse41 -mno-sse42 -mno-aes -mno-pclmul -mno-avx
-mno-xop -mno-fma -mno-fma4 -m32"
.text
.align 2
.section .text
.p2align 5,,
# Program Unit: foo
#
# int foo(int x, int b, int *__restrict a) -- compiler output for x.c
# (GAS, AT&T syntax: source operand first, destination last).
# Arguments are read from the stack. After the three pushes and the 16-byte
# frame set up below they sit at 32(%esp) = x, 36(%esp) = b, 40(%esp) = a.
# The result is returned in %eax.
# Register roles inside the loop:
#   %ebx = x (running sum)     %ebp = c = b*60     %eax = a (store pointer)
#   %edx = i (loop counter)    %edi = b (loop bound)
# The trailing "# [n]" notes are emitted by the compiler (presumably
# scheduling cycles -- not confirmed from this listing).
.globl foo
.type foo, @function
foo: # 0x0
# .frame %esp, 16, %esp
# _temp_gra_spill0 = 0
.loc 1 2 0
# 1 int foo(int x, int b, int *__restrict a)
# 2 {
.LBB1_foo:
# Prologue: save %ebp, %ebx, %edi and reserve a 16-byte frame.
pushl %ebp # [0]
pushl %ebx # [3]
pushl %edi # [6]
addl $-16,%esp # [9]
# Zero-trip guard: branch to .Lt_0_2818 when b - 1 is negative.
movl 36(%esp),%edi # [10] b
leal -1(%edi),%eax # [13]
testl %eax,%eax # [14]
jl .Lt_0_2818 # [15]
.LBB2_foo:
# Loop preheader: load x and a, compute c = b*60 in %ebp, zero the counter.
movl %edi,%ebp # [0]
.loc 1 8 0
# 4 int c,d ;
# 5 c = b*60;
# 6 d = c+44;
# 7
# 8 for (i = 0; i< b; i++)
movl %edi,%ecx # [0]
movl 32(%esp),%ebx # [0] x
# b is also copied to the spill slot at 0(%esp); nothing in this listing
# reloads it.
movl %ecx,0(%esp) # [1] _temp_gra_spill0
imull $60,%ebp # [1]
movl 40(%esp),%eax # [1] a
xorl %edx,%edx # [2]
.p2align 5,,31
.Lt_0_3586:
#<loop> Loop body line 8, nesting depth: 1, estimated iterations: 1000
# Seven instructions per iteration. x = x + d is performed as two adds
# (c in %ebp, then the constant 44) rather than one add of a precomputed d.
# Both the counter %edx and the pointer %eax are incremented every iteration.
.loc 1 11 0
# 9 {
# 10 x = x+d;
# 11 *a++=x;
addl $1,%edx # [0]
.loc 1 10 0
addl %ebp,%ebx # [0]
.loc 1 11 0
addl $4,%eax # [0]
.loc 1 10 0
addl $44,%ebx # [1]
.loc 1 11 0
cmpl %edi,%edx # [1]
# The store uses -4(%eax) because %eax was already advanced past the element.
# movl does not modify flags, so jl still tests the cmpl above (i < b, signed).
movl %ebx,-4(%eax) # [2] id:17
jl .Lt_0_3586 # [2]
.Lt_0_4098:
# Loop exit: return the accumulated x, then release the frame and restore
# the saved registers in reverse push order.
.loc 1 13 0
# 12 }
# 13 return x;
movl %ebx,%eax # [0]
addl $16,%esp # [0]
popl %edi # [1]
popl %ebx # [4]
popl %ebp # [7]
ret # [7]
.p2align 5,,31
.Lt_0_2818:
# Zero-trip path: return the incoming x unchanged.
.loc 1 11 0
movl 32(%esp),%eax # [0] x
.loc 1 13 0
addl $16,%esp # [0]
popl %edi # [1]
popl %ebx # [4]
popl %ebp # [7]
ret # [7]
.LDWend_foo:
.size foo, .LDWend_foo-foo
.section .text
.align 4
.section .eh_frame, "a",@progbits
.LEHCIE:
.4byte .LEHCIE_end - .LEHCIE_begin
.LEHCIE_begin:
.4byte 0x0
.byte 0x01, 0x00, 0x01, 0x7c, 0x08, 0x0c, 0x04, 0x04
.byte 0x88, 0x01
.align 4
.LEHCIE_end:
.section .debug_line, ""
.section .note.GNU-stack,"",@progbits
.ident "#Open64 Compiler Version 5.0 : x.c compiled with : -O3
-OPT:unroll_times_max=1 -march=barcelona -msse2 -msse3 -mno-3dnow
-mno-sse4a -mno-ssse3 -mno-sse41 -mno-sse42 -mno-aes -mno-pclmul -mno-avx
-mno-xop -mno-fma -mno-fma4 -m32"
------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and
threat landscape has changed and how IT managers can respond. Discussions
will include endpoint security, mobile security and the latest in malware
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Open64-devel mailing list
Open64-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/open64-devel