Oops -- I see the problem: I used "-s" instead of "-S", so the old output I sent you was meaningless; it came from a different test. Sorry! :-(
But your test is much smaller and easier to check, so let's do that: ----- [7:49] svbu-mpi:~/tmp % cat george2.c #include <stdint.h> #define SMPLOCK "lock;" static inline int opal_atomic_cmpset_32( volatile int32_t *addr, int32_t oldval, int32_t newval) { unsigned char ret; __asm__ __volatile__ ( SMPLOCK "cmpxchgl %1,%2 \n\t" "sete %0 \n\t" : "=qm" (ret) : "q"(newval), "m"(*addr), "a"(oldval) : "memory"); return (int)ret; } int main(int argc, char* argv[] ) { int32_t value = 0, oldval = 0, delta = 1; int32_t* addr = &value; do { oldval = *addr; } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta)); return (oldval + delta); } [7:49] svbu-mpi:~/tmp % pgcc -O3 -c -S george2.c [7:49] svbu-mpi:~/tmp % cat george2.s .file "george2.c" .version "01.01" ## PGC 7.0 -opt 3 ## PGC 06/08/2010 07:49:33 ## pgcc george2.c -O3 -c -S ## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc ## george2.c -opt 3 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x 123 0x1000 ## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1 -quad ## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include ## -def unix -def __unix -def __unix__ -def linux -def __linux -def __linux__ ## -def __NO_MATH_INLINES -def __x86_64__ -def __LONG_MAX__=9223372036854775807L ## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int -def __THROW= ## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def __SSE2__ ## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) #cpu(x86_64) ## -cmdline +pgcc george2.c -O3 -c -S -x 123 4 -x 123 0x80000000 -alwaysinline /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4 ## -asm george2.s ## lineno: 7 .text .align 16 opal_atomic_cmpset_32: ..Dcfb0: pushq %rbp ..Dcfi0: movq %rsp, %rbp ..Dcfi1: movl %esi, -4(%rbp) movl %edx, -8(%rbp) ..EN1: ## lineno: 16 movl -8(%rbp), 
%edx movl -4(%rbp), %eax lock;cmpxchgl %edx,(%rdi) sete %cl movb %cl, -9(%rbp) ## lineno: 17 movzbl -9(%rbp), %eax ## lineno: 0 popq %rbp ret .type opal_atomic_cmpset_32,@function .size opal_atomic_cmpset_32,.-opal_atomic_cmpset_32 ..Dcfe0: __opal_atomic_cmpset_32END: .section .pgi_trace .align 8 .quad opal_atomic_cmpset_32 ## address of routine .quad __opal_atomic_cmpset_32END - opal_atomic_cmpset_32 ## size of routine .2byte 0 ## flags for future use .2byte 21 ## length of following string ## name:opal_atomic_cmpset_32: .byte 0x6f,0x70,0x61,0x6c,0x5f,0x61,0x74,0x6f,0x6d,0x69,0x63 .byte 0x5f,0x63,0x6d,0x70,0x73,0x65,0x74,0x5f,0x33,0x32,0x00 ## lineno: 20 .text .align 16 .globl main main: ..Dcfb1: pushq %rbp ..Dcfi2: movq %rsp, %rbp ..Dcfi3: subq $16, %rsp movq %rbx, -16(%rbp) pushq %rax pushq %rax stmxcsr (%rsp) popq %rax orq $64, %rax pushq %rax ldmxcsr (%rsp) popq %rax popq %rax ## lineno: 0 ..EN2: ## lineno: 21 movl $0, -4(%rbp) .align 8 .LB191: ## lineno: 24 movl -4(%rbp), %ebx leaq -4(%rbp), %rdi leal 1(%rbx), %edx movl %ebx, %esi call opal_atomic_cmpset_32 testl %eax, %eax je .LB191 ## lineno: 27 addl $1, %ebx movl %ebx, %eax ## lineno: 28 movq -16(%rbp), %rbx leave ret .type main,@function .size main,.-main ..Dcfe1: __mainEND: .section .pgi_trace .align 8 .quad main ## address of routine .quad __mainEND - main ## size of routine .2byte 0 ## flags for future use .2byte 4 ## length of following string ## name:main: .byte 0x6d,0x61,0x69,0x6e,0x00 .data .section .debug_frame ..Dcieb0: .4byte ..Dciee0-..Dcieb0-4 ## CIE length .4byte 0xffffffff ## CIE ID .byte 0x1 ## CIE version .byte 0x0 ## no augmentation .byte 0x1 ## ULEB128 1, code alignment factor .byte 0x78 ## SLEB128 -8, data alignment factor .byte 0x10 ## return address column .byte 0xc ## DW_CFA_def_cfa (col 7) .byte 0x7 ## ULEB128 7 .byte 0x8 ## ULEB128 8 .byte 0x90 ## DW_CFA_offset (col 16) .byte 0x1 ## ULEB128 1 .align 8 ..Dciee0: .4byte ..Dfdee0-..Dfdeb0 ## FDE length ..Dfdeb0: .4byte ..Dcieb0 ## CIE 
pointer .quad ..Dcfb0 ## initial location .quad ..Dcfe0-..Dcfb0 ## address range .byte 0x4 ## DW_CFA_advance_loc4 .4byte ..Dcfi0-..Dcfb0 .byte 0xe ## DW_CFA_def_cfa_offset .byte 0x10 ## ULEB128 16 .byte 0x86 ## DW_CFA_offset (col 6) .byte 0x2 ## ULEB128 2 .byte 0x4 ## DW_CFA_advance_loc4 .4byte ..Dcfi1-..Dcfi0 .byte 0xd ## DW_CFA_def_cfa_register (col 6) .byte 0x6 ## ULEB128 6 .align 8 ..Dfdee0: .4byte ..Dfdee1-..Dfdeb1 ## FDE length ..Dfdeb1: .4byte ..Dcieb0 ## CIE pointer .quad ..Dcfb1 ## initial location .quad ..Dcfe1-..Dcfb1 ## address range .byte 0x4 ## DW_CFA_advance_loc4 .4byte ..Dcfi2-..Dcfb1 .byte 0xe ## DW_CFA_def_cfa_offset .byte 0x10 ## ULEB128 16 .byte 0x86 ## DW_CFA_offset (col 6) .byte 0x2 ## ULEB128 2 .byte 0x4 ## DW_CFA_advance_loc4 .4byte ..Dcfi3-..Dcfi2 .byte 0xd ## DW_CFA_def_cfa_register (col 6) .byte 0x6 ## ULEB128 6 .align 8 ..Dfdee1: .ident "PGC 7.0-7" [7:49] svbu-mpi:~/tmp % ----- On Jun 8, 2010, at 10:46 AM, George Bosilca wrote: > It didn't work. Let's try with this small complete application: > > #include <stdint.h> > > #define SMPLOCK "lock;" > > static inline int opal_atomic_cmpset_32( volatile int32_t *addr, > int32_t oldval, int32_t newval) > { > unsigned char ret; > __asm__ __volatile__ ( > SMPLOCK "cmpxchgl %1,%2 \n\t" > "sete %0 \n\t" > : "=qm" (ret) > : "q"(newval), "m"(*addr), "a"(oldval) > : "memory"); > > return (int)ret; > } > > int main(int argc, char* argv[] ) > { > int32_t value = 0, oldval = 0, delta = 1; > int32_t* addr = &value; > > do { > oldval = *addr; > } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta)); > return (oldval + delta); > } > > > > Thanks, > george. > > > On Jun 8, 2010, at 14:42 , Jeff Squyres wrote: > >> Look at my output -- I did... >> >> On Jun 8, 2010, at 10:40 AM, George Bosilca wrote: >> >>> Still no good, the opal_atomic_cmpset_32 is not inlined. Try to add -O3 to >>> your command line, this helped for gcc. >>> >>> Thanks, >>> george. 
>>> >>> On Jun 8, 2010, at 14:14 , Jeff Squyres wrote: >>> >>>> On Jun 8, 2010, at 9:53 AM, George Bosilca wrote: >>>> >>>>> As you can see there is no explicit call, the opal_atomic_cmpset_32 is >>>>> really inlined. I think the problem is that you didn't specify the -O3 >>>>> flag on your command line. >>>> >>>> Ah, you wanted me to compile the OMPI code itself and send you the >>>> assembly. That's not what you asked for. :-) >>>> >>>> (I just took the code you sent in the mail, stuffed it into george.c, and >>>> compiled that with -s -- outside of the context of the Open MPI code tree) >>>> >>>> Here's the new output. It still didn't inline, but you can see the code >>>> for the _cmpset function: >>>> >>>> ----- >>>> [7:13] svbu-mpi:~/tmp % cat george.c >>>> >>>> #include <stdint.h> >>>> >>>> #include "opal/sys/atomic.h" >>>> >>>> int foo(void) { >>>> int32_t oldval, delta; >>>> int32_t *addr = 0; >>>> do { >>>> oldval = *addr; >>>> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta)); >>>> return (oldval + delta); >>>> } >>>> >>>> [7:13] svbu-mpi:~/tmp % pgcc -O3 -I /home/jsquyres/svn/ompi4 >>>> -I/home/jsquyres/svn/ompi4/opal/include -c -s george.c >>>> [7:13] svbu-mpi:~/tmp % cat george.s >>>> .file "george.c" >>>> .version "01.01" >>>> ## PGC 7.0 -opt 1 >>>> ## PGC 06/08/2010 05:10:04 >>>> ## pgcc george.c -c -S >>>> ## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc >>>> ## george.c -opt 1 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x >>>> 123 0x1000 >>>> ## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1 >>>> -quad >>>> ## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc >>>> /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include >>>> ## -def unix -def __unix -def __unix__ -def linux -def __linux -def >>>> __linux__ >>>> ## -def __NO_MATH_INLINES -def __x86_64__ -def >>>> __LONG_MAX__=9223372036854775807L >>>> ## 
-def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int >>>> -def __THROW= >>>> ## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def >>>> __SSE2__ >>>> ## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) >>>> #cpu(x86_64) >>>> ## -cmdline +pgcc george.c -c -S -x 123 4 -x 123 0x80000000 -alwaysinline >>>> /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4 >>>> ## -asm george.s >>>> ## lineno: 3 >>>> .text >>>> .align 16 >>>> .globl foo >>>> foo: >>>> ..Dcfb0: >>>> pushq %rbp >>>> ..Dcfi0: >>>> movq %rsp, %rbp >>>> ..Dcfi1: >>>> subq $16, %rsp >>>> ..EN1: >>>> ## lineno: 5 >>>> movq $0, -8(%rbp) >>>> .p2align 4,,3 >>>> .LB157: >>>> ## lineno: 6 >>>> movq -8(%rbp), %rdi >>>> movl (%rdi), %esi >>>> movl %esi, -12(%rbp) >>>> movl -16(%rbp), %edx >>>> addl %esi, %edx >>>> xorl %eax, %eax >>>> call opal_atomic_cmpset_32 >>>> testl %eax, %eax >>>> je .LB157 >>>> movl -16(%rbp), %eax >>>> addl -12(%rbp), %eax >>>> ## lineno: 10 >>>> leave >>>> ret >>>> .type foo,@function >>>> .size foo,.-foo >>>> ..Dcfe0: >>>> __fooEND: >>>> .section .pgi_trace >>>> .align 8 >>>> .quad foo ## address of routine >>>> .quad __fooEND - foo ## size of routine >>>> .2byte 0 ## flags for future use >>>> .2byte 3 ## length of following string >>>> ## name:foo: >>>> .byte 0x66,0x6f,0x6f,0x00 >>>> .data >>>> .globl opal_atomic_cmpset_32 >>>> .section .debug_frame >>>> ..Dcieb0: >>>> .4byte ..Dciee0-..Dcieb0-4 ## CIE length >>>> .4byte 0xffffffff ## CIE ID >>>> .byte 0x1 ## CIE version >>>> .byte 0x0 ## no augmentation >>>> .byte 0x1 ## ULEB128 1, code alignment factor >>>> .byte 0x78 ## SLEB128 -8, data alignment factor >>>> .byte 0x10 ## return address column >>>> .byte 0xc ## DW_CFA_def_cfa (col 7) >>>> .byte 0x7 ## ULEB128 7 >>>> .byte 0x8 ## ULEB128 8 >>>> .byte 0x90 ## DW_CFA_offset (col 16) >>>> .byte 0x1 ## ULEB128 1 >>>> .align 8 >>>> ..Dciee0: >>>> .4byte ..Dfdee0-..Dfdeb0 ## FDE length >>>> ..Dfdeb0: >>>> .4byte ..Dcieb0 ## CIE pointer 
>>>> .quad ..Dcfb0 ## initial location >>>> .quad ..Dcfe0-..Dcfb0 ## address range >>>> .byte 0x4 ## DW_CFA_advance_loc4 >>>> .4byte ..Dcfi0-..Dcfb0 >>>> .byte 0xe ## DW_CFA_def_cfa_offset >>>> .byte 0x10 ## ULEB128 16 >>>> .byte 0x86 ## DW_CFA_offset (col 6) >>>> .byte 0x2 ## ULEB128 2 >>>> .byte 0x4 ## DW_CFA_advance_loc4 >>>> .4byte ..Dcfi1-..Dcfi0 >>>> .byte 0xd ## DW_CFA_def_cfa_register (col 6) >>>> .byte 0x6 ## ULEB128 6 >>>> .align 8 >>>> ..Dfdee0: >>>> .ident "PGC 7.0-7" >>>> [7:13] svbu-mpi:~/tmp % >>>> ----- >>>> >>>> -- >>>> Jeff Squyres >>>> jsquy...@cisco.com >>>> For corporate legal information go to: >>>> http://www.cisco.com/web/about/doing_business/legal/cri/ >>>> >>>> >>>> _______________________________________________ >>>> devel mailing list >>>> de...@open-mpi.org >>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel >>> >>> >>> _______________________________________________ >>> devel mailing list >>> de...@open-mpi.org >>> http://www.open-mpi.org/mailman/listinfo.cgi/devel >>> >> >> >> -- >> Jeff Squyres >> jsquy...@cisco.com >> For corporate legal information go to: >> http://www.cisco.com/web/about/doing_business/legal/cri/ >> >> >> _______________________________________________ >> devel mailing list >> de...@open-mpi.org >> http://www.open-mpi.org/mailman/listinfo.cgi/devel > > > _______________________________________________ > devel mailing list > de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/