Hi All,
It looks like strength reduction is not optimal for the following example.
The generated loop uses 7 instructions per iteration, but only 4 (or 5
without LFTR, i.e. linear function test replacement) are necessary.
Best Regards,
Yiran Wang
bash-4.0$ cat x.c
int foo(int x, int b, int *__restrict a) /* adds d to x once per iteration (b iterations), storing each running value through a; returns the final x */
{
int i;
int c,d ;
c = b*60; /* computed once, before the loop */
d = c+44; /* loop-invariant step added to x on every iteration */
for (i = 0; i< b; i++) /* body does not run when b <= 0, so x is returned unchanged */
{
x = x+d;
*a++=x; /* store the updated x, then advance a to the next int */
}
return x;
}
bash-4.0$ /opt/open64tr/bin/opencc -c -O3 -keep x.c -Wb,-trlow,-tt25:0xffffffff -OPT:unroll_times_max=1 -march=barcelona
bash-4.0$ cat x.s
# /opt/open64tr/lib/gcc-lib/x86_64-open64-linux/5.0/be::5.0
#-----------------------------------------------------------
# Compiling x.c (x.I)
#-----------------------------------------------------------
#-----------------------------------------------------------
# Options:
#-----------------------------------------------------------
# Target:Barcelona, ISA:ISA_1, Endian:little, Pointer Size:32
# -O3(Optimization level)
# -g0(Debug level)
# -m2(Report advisories)
#-----------------------------------------------------------
int foo(int x, int b, int *__restrict a) /* adds d to x once per iteration (b iterations), storing each running value through a; returns the final x */
{
int i;
int c,d ;
c = b*60; /* computed once, before the loop */
d = c+44; /* loop-invariant step added to x on every iteration */
for (i = 0; i< b; i++) /* body does not run when b <= 0, so x is returned unchanged */
{
x = x+d;
*a++=x; /* store the updated x, then advance a to the next int */
}
return x;
}
bash-4.0$ /opt/open64tr/bin/opencc -c -O3 -keep x.c -Wb,-trlow,-tt25:0xffffffff -OPT:unroll_times_max=1 -march=barcelona
bash-4.0$ cat x.s
# /opt/open64tr/lib/gcc-lib/x86_64-open64-linux/5.0/be::5.0
#-----------------------------------------------------------
# Compiling x.c (x.I)
#-----------------------------------------------------------
#-----------------------------------------------------------
# Options:
#-----------------------------------------------------------
# Target:Barcelona, ISA:ISA_1, Endian:little, Pointer Size:32
# -O3(Optimization level)
# -g0(Debug level)
# -m2(Report advisories)
#-----------------------------------------------------------
#-----------------------------------------------------------
# foo -- Open64 5.0 output for x.c (32-bit x86, AT&T syntax, built with -m32).
# C signature: int foo(int x, int b, int *__restrict a)
# Stack arguments as addressed after the prologue (3 pushes + 16-byte frame):
#   32(%esp) = x, 36(%esp) = b, 40(%esp) = a
# Register roles inside the loop:
#   %ebx = x (running value)   %ebp = c = b*60   %edi = b
#   %eax = a (store pointer)   %edx = i
# Result is returned in %eax; %ebp, %ebx and %edi are saved and restored.
#-----------------------------------------------------------
        .text
        .align 2
        .section .text
        .p2align 5,,
        # Program Unit: foo
        .globl foo
        .type foo, @function
foo:    # 0x0
        # .frame %esp, 16, %esp
        # _temp_gra_spill0 = 0
        .loc 1 2 0
# 1 int foo(int x, int b, int *__restrict a)
# 2 {
.LBB1_foo:
        pushl %ebp # [0]
        pushl %ebx # [3]
        pushl %edi # [6]
        addl $-16,%esp # [9]
        movl 36(%esp),%edi # [10] b
        leal -1(%edi),%eax # [13]
        testl %eax,%eax # [14]
        jl .Lt_0_2818 # [15] skip the loop when b-1 < 0
.LBB2_foo:
        movl %edi,%ebp # [0]
        .loc 1 8 0
# 4 int c,d ;
# 5 c = b*60;
# 6 d = c+44;
# 7
# 8 for (i = 0; i< b; i++)
        movl %edi,%ecx # [0]
        movl 32(%esp),%ebx # [0] x
        movl %ecx,0(%esp) # [1] _temp_gra_spill0 (b stored to the spill slot; no reload appears in this function)
        imull $60,%ebp # [1] %ebp = c = b*60
        movl 40(%esp),%eax # [1] a
        xorl %edx,%edx # [2] i = 0
        .p2align 5,,31
.Lt_0_3586:
#<loop> Loop body line 8, nesting depth: 1, estimated iterations: 1000
# Seven instructions per iteration; d = c+44 is applied as two separate adds.
        .loc 1 11 0
# 9 {
# 10 x = x+d;
# 11 *a++=x;
        addl $1,%edx # [0] i++
        .loc 1 10 0
        addl %ebp,%ebx # [0] x += c
        .loc 1 11 0
        addl $4,%eax # [0] a++
        .loc 1 10 0
        addl $44,%ebx # [1] x += 44 (with the add above: x += d)
        .loc 1 11 0
        cmpl %edi,%edx # [1] compare i with b
        movl %ebx,-4(%eax) # [2] id:17 store x through the pre-increment value of a
        jl .Lt_0_3586 # [2] loop while i < b (signed)
.Lt_0_4098:
        .loc 1 13 0
# 12 }
# 13 return x;
        movl %ebx,%eax # [0]
        addl $16,%esp # [0]
        popl %edi # [1]
        popl %ebx # [4]
        popl %ebp # [7]
        ret # [7]
        .p2align 5,,31
.Lt_0_2818:
        # Loop skipped: return the incoming x unchanged.
        .loc 1 11 0
        movl 32(%esp),%eax # [0] x
        .loc 1 13 0
        addl $16,%esp # [0]
        popl %edi # [1]
        popl %ebx # [4]
        popl %ebp # [7]
        ret # [7]
.LDWend_foo:
        .size foo, .LDWend_foo-foo
        .section .text
        .align 4
        .section .eh_frame, "a",@progbits
.LEHCIE:
        .4byte .LEHCIE_end - .LEHCIE_begin
.LEHCIE_begin:
        .4byte 0x0
        .byte 0x01, 0x00, 0x01, 0x7c, 0x08, 0x0c, 0x04, 0x04
        .byte 0x88, 0x01
        .align 4
.LEHCIE_end:
        .section .debug_line, ""
        .section .note.GNU-stack,"",@progbits
        .ident "#Open64 Compiler Version 5.0 : x.c compiled with : -O3 -OPT:unroll_times_max=1 -march=barcelona -msse2 -msse3 -mno-3dnow -mno-sse4a -mno-ssse3 -mno-sse41 -mno-sse42 -mno-aes -mno-pclmul -mno-avx -mno-xop -mno-fma -mno-fma4 -m32"
#-----------------------------------------------------------
# foo -- Open64 5.0 output for x.c (32-bit x86, AT&T syntax, built with -m32).
# C signature: int foo(int x, int b, int *__restrict a)
# Stack arguments as addressed after the prologue (3 pushes + 16-byte frame):
#   32(%esp) = x, 36(%esp) = b, 40(%esp) = a
# Register roles inside the loop:
#   %ebx = x (running value)   %ebp = c = b*60   %edi = b
#   %eax = a (store pointer)   %edx = i
# Result is returned in %eax; %ebp, %ebx and %edi are saved and restored.
#-----------------------------------------------------------
        .text
        .align 2
        .section .text
        .p2align 5,,
        # Program Unit: foo
        .globl foo
        .type foo, @function
foo:    # 0x0
        # .frame %esp, 16, %esp
        # _temp_gra_spill0 = 0
        .loc 1 2 0
# 1 int foo(int x, int b, int *__restrict a)
# 2 {
.LBB1_foo:
        pushl %ebp # [0]
        pushl %ebx # [3]
        pushl %edi # [6]
        addl $-16,%esp # [9]
        movl 36(%esp),%edi # [10] b
        leal -1(%edi),%eax # [13]
        testl %eax,%eax # [14]
        jl .Lt_0_2818 # [15] skip the loop when b-1 < 0
.LBB2_foo:
        movl %edi,%ebp # [0]
        .loc 1 8 0
# 4 int c,d ;
# 5 c = b*60;
# 6 d = c+44;
# 7
# 8 for (i = 0; i< b; i++)
        movl %edi,%ecx # [0]
        movl 32(%esp),%ebx # [0] x
        movl %ecx,0(%esp) # [1] _temp_gra_spill0 (b stored to the spill slot; no reload appears in this function)
        imull $60,%ebp # [1] %ebp = c = b*60
        movl 40(%esp),%eax # [1] a
        xorl %edx,%edx # [2] i = 0
        .p2align 5,,31
.Lt_0_3586:
#<loop> Loop body line 8, nesting depth: 1, estimated iterations: 1000
# Seven instructions per iteration; d = c+44 is applied as two separate adds.
        .loc 1 11 0
# 9 {
# 10 x = x+d;
# 11 *a++=x;
        addl $1,%edx # [0] i++
        .loc 1 10 0
        addl %ebp,%ebx # [0] x += c
        .loc 1 11 0
        addl $4,%eax # [0] a++
        .loc 1 10 0
        addl $44,%ebx # [1] x += 44 (with the add above: x += d)
        .loc 1 11 0
        cmpl %edi,%edx # [1] compare i with b
        movl %ebx,-4(%eax) # [2] id:17 store x through the pre-increment value of a
        jl .Lt_0_3586 # [2] loop while i < b (signed)
.Lt_0_4098:
        .loc 1 13 0
# 12 }
# 13 return x;
        movl %ebx,%eax # [0]
        addl $16,%esp # [0]
        popl %edi # [1]
        popl %ebx # [4]
        popl %ebp # [7]
        ret # [7]
        .p2align 5,,31
.Lt_0_2818:
        # Loop skipped: return the incoming x unchanged.
        .loc 1 11 0
        movl 32(%esp),%eax # [0] x
        .loc 1 13 0
        addl $16,%esp # [0]
        popl %edi # [1]
        popl %ebx # [4]
        popl %ebp # [7]
        ret # [7]
.LDWend_foo:
        .size foo, .LDWend_foo-foo
        .section .text
        .align 4
        .section .eh_frame, "a",@progbits
.LEHCIE:
        .4byte .LEHCIE_end - .LEHCIE_begin
.LEHCIE_begin:
        .4byte 0x0
        .byte 0x01, 0x00, 0x01, 0x7c, 0x08, 0x0c, 0x04, 0x04
        .byte 0x88, 0x01
        .align 4
.LEHCIE_end:
        .section .debug_line, ""
        .section .note.GNU-stack,"",@progbits
        .ident "#Open64 Compiler Version 5.0 : x.c compiled with : -O3 -OPT:unroll_times_max=1 -march=barcelona -msse2 -msse3 -mno-3dnow -mno-sse4a -mno-ssse3 -mno-sse41 -mno-sse42 -mno-aes -mno-pclmul -mno-avx -mno-xop -mno-fma -mno-fma4 -m32"
------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and
threat landscape has changed and how IT managers can respond. Discussions
will include endpoint security, mobile security and the latest in malware
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Open64-devel mailing list
Open64-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/open64-devel