Look at my output -- I did compile with -O3 (see the pgcc command line in the quoted output below). On Jun 8, 2010, at 10:40 AM, George Bosilca wrote:
> Still no good, the opal_atomic_cmpset_32 is not inlined. Try to add -O3 to > your command line, this helped for gcc. > > Thanks, > george. > > On Jun 8, 2010, at 14:14 , Jeff Squyres wrote: > > > On Jun 8, 2010, at 9:53 AM, George Bosilca wrote: > > > >> As you can see there is no explicit call, the opal_atomic_cmpset_32 is > >> really inlined. I think the problem is that you didn't specify the -O3 > >> flag on your command line. > > > > Ah, you wanted me to compile the OMPI code itself and send you the > > assembly. That's not what you asked for. :-) > > > > (I just took the code you sent in the mail, stuffed it into george.c, and > > compiled that with -s -- outside of the context of the Open MPI code tree) > > > > Here's the new output. It still didn't inline, but you can see the code > > for the _cmpset function: > > > > ----- > > [7:13] svbu-mpi:~/tmp % cat george.c > > > > #include <stdint.h> > > > > #include "opal/sys/atomic.h" > > > > int foo(void) { > > int32_t oldval, delta; > > int32_t *addr = 0; > > do { > > oldval = *addr; > > } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta)); > > return (oldval + delta); > > } > > > > [7:13] svbu-mpi:~/tmp % pgcc -O3 -I /home/jsquyres/svn/ompi4 > > -I/home/jsquyres/svn/ompi4/opal/include -c -s george.c > > [7:13] svbu-mpi:~/tmp % cat george.s > > .file "george.c" > > .version "01.01" > > ## PGC 7.0 -opt 1 > > ## PGC 06/08/2010 05:10:04 > > ## pgcc george.c -c -S > > ## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc > > ## george.c -opt 1 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x 123 > > 0x1000 > > ## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1 > > -quad > > ## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc > > /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include > > ## -def unix -def __unix -def __unix__ -def linux -def __linux -def > > __linux__ > > ## -def 
__NO_MATH_INLINES -def __x86_64__ -def > > __LONG_MAX__=9223372036854775807L > > ## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int -def > > __THROW= > > ## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def > > __SSE2__ > > ## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) > > #cpu(x86_64) > > ## -cmdline +pgcc george.c -c -S -x 123 4 -x 123 0x80000000 -alwaysinline > > /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4 > > ## -asm george.s > > ## lineno: 3 > > .text > > .align 16 > > .globl foo > > foo: > > ..Dcfb0: > > pushq %rbp > > ..Dcfi0: > > movq %rsp, %rbp > > ..Dcfi1: > > subq $16, %rsp > > ..EN1: > > ## lineno: 5 > > movq $0, -8(%rbp) > > .p2align 4,,3 > > .LB157: > > ## lineno: 6 > > movq -8(%rbp), %rdi > > movl (%rdi), %esi > > movl %esi, -12(%rbp) > > movl -16(%rbp), %edx > > addl %esi, %edx > > xorl %eax, %eax > > call opal_atomic_cmpset_32 > > testl %eax, %eax > > je .LB157 > > movl -16(%rbp), %eax > > addl -12(%rbp), %eax > > ## lineno: 10 > > leave > > ret > > .type foo,@function > > .size foo,.-foo > > ..Dcfe0: > > __fooEND: > > .section .pgi_trace > > .align 8 > > .quad foo ## address of routine > > .quad __fooEND - foo ## size of routine > > .2byte 0 ## flags for future use > > .2byte 3 ## length of following string > > ## name:foo: > > .byte 0x66,0x6f,0x6f,0x00 > > .data > > .globl opal_atomic_cmpset_32 > > .section .debug_frame > > ..Dcieb0: > > .4byte ..Dciee0-..Dcieb0-4 ## CIE length > > .4byte 0xffffffff ## CIE ID > > .byte 0x1 ## CIE version > > .byte 0x0 ## no augmentation > > .byte 0x1 ## ULEB128 1, code alignment factor > > .byte 0x78 ## SLEB128 -8, data alignment factor > > .byte 0x10 ## return address column > > .byte 0xc ## DW_CFA_def_cfa (col 7) > > .byte 0x7 ## ULEB128 7 > > .byte 0x8 ## ULEB128 8 > > .byte 0x90 ## DW_CFA_offset (col 16) > > .byte 0x1 ## ULEB128 1 > > .align 8 > > ..Dciee0: > > .4byte ..Dfdee0-..Dfdeb0 ## FDE length > > ..Dfdeb0: > > .4byte ..Dcieb0 ## 
CIE pointer > > .quad ..Dcfb0 ## initial location > > .quad ..Dcfe0-..Dcfb0 ## address range > > .byte 0x4 ## DW_CFA_advance_loc4 > > .4byte ..Dcfi0-..Dcfb0 > > .byte 0xe ## DW_CFA_def_cfa_offset > > .byte 0x10 ## ULEB128 16 > > .byte 0x86 ## DW_CFA_offset (col 6) > > .byte 0x2 ## ULEB128 2 > > .byte 0x4 ## DW_CFA_advance_loc4 > > .4byte ..Dcfi1-..Dcfi0 > > .byte 0xd ## DW_CFA_def_cfa_register (col 6) > > .byte 0x6 ## ULEB128 6 > > .align 8 > > ..Dfdee0: > > .ident "PGC 7.0-7" > > [7:13] svbu-mpi:~/tmp % > > ----- > > > > -- > > Jeff Squyres > > jsquy...@cisco.com > > For corporate legal information go to: > > http://www.cisco.com/web/about/doing_business/legal/cri/ > > > > > > _______________________________________________ > > devel mailing list > > de...@open-mpi.org > > http://www.open-mpi.org/mailman/listinfo.cgi/devel > > > _______________________________________________ > devel mailing list > de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel > -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/