On Jun 8, 2010, at 14:49 , Jeff Squyres wrote: > ## -cmdline +pgcc george2.c -O3 -c -S -x 123 4 -x 123 0x80000000 > -alwaysinline /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4 > ## -asm george2.s > ## lineno: 7 > .text > .align 16 > opal_atomic_cmpset_32: > ..Dcfb0: > pushq %rbp > ..Dcfi0: > movq %rsp, %rbp > ..Dcfi1: > movl %esi, -4(%rbp) > movl %edx, -8(%rbp) > ..EN1: > ## lineno: 16 > movl -8(%rbp), %edx > movl -4(%rbp), %eax
oldval is moved into %eax ... once > lock;cmpxchgl %edx,(%rdi) > sete %cl The CCR is retrieved > movb %cl, -9(%rbp) And stored. But the loop disappeared. I really doubt the correctness of this assembly code. george. > ## lineno: 17 > movzbl -9(%rbp), %eax > ## lineno: 0 > popq %rbp > ret > .type opal_atomic_cmpset_32,@function > .size opal_atomic_cmpset_32,.-opal_atomic_cmpset_32 > ..Dcfe0: > __opal_atomic_cmpset_32END: > .section .pgi_trace > .align 8 > .quad opal_atomic_cmpset_32 ## address of routine > .quad __opal_atomic_cmpset_32END - opal_atomic_cmpset_32 ## size > of routine > .2byte 0 ## flags for future use > .2byte 21 ## length of following string > ## name:opal_atomic_cmpset_32: > .byte 0x6f,0x70,0x61,0x6c,0x5f,0x61,0x74,0x6f,0x6d,0x69,0x63 > .byte 0x5f,0x63,0x6d,0x70,0x73,0x65,0x74,0x5f,0x33,0x32,0x00 > ## lineno: 20 > .text > .align 16 > .globl main > main: > ..Dcfb1: > pushq %rbp > ..Dcfi2: > movq %rsp, %rbp > ..Dcfi3: > subq $16, %rsp > movq %rbx, -16(%rbp) > pushq %rax > pushq %rax > stmxcsr (%rsp) > popq %rax > orq $64, %rax > pushq %rax > ldmxcsr (%rsp) > popq %rax > popq %rax > ## lineno: 0 > ..EN2: > ## lineno: 21 > movl $0, -4(%rbp) > .align 8 > .LB191: > ## lineno: 24 > movl -4(%rbp), %ebx > leaq -4(%rbp), %rdi > leal 1(%rbx), %edx > movl %ebx, %esi > call opal_atomic_cmpset_32 > testl %eax, %eax > je .LB191 > ## lineno: 27 > addl $1, %ebx > movl %ebx, %eax > ## lineno: 28 > movq -16(%rbp), %rbx > leave > ret > .type main,@function > .size main,.-main > ..Dcfe1: > __mainEND: > .section .pgi_trace > .align 8 > .quad main ## address of routine > .quad __mainEND - main ## size of routine > .2byte 0 ## flags for future use > .2byte 4 ## length of following string > ## name:main: > .byte 0x6d,0x61,0x69,0x6e,0x00 > .data > .section .debug_frame > ..Dcieb0: > .4byte ..Dciee0-..Dcieb0-4 ## CIE length > .4byte 0xffffffff ## CIE ID > .byte 0x1 ## CIE version > .byte 0x0 ## no augmentation > .byte 0x1 ## ULEB128 1, code alignment factor 
> .byte 0x78 ## SLEB128 -8, data alignment factor > .byte 0x10 ## return address column > .byte 0xc ## DW_CFA_def_cfa (col 7) > .byte 0x7 ## ULEB128 7 > .byte 0x8 ## ULEB128 8 > .byte 0x90 ## DW_CFA_offset (col 16) > .byte 0x1 ## ULEB128 1 > .align 8 > ..Dciee0: > .4byte ..Dfdee0-..Dfdeb0 ## FDE length > ..Dfdeb0: > .4byte ..Dcieb0 ## CIE pointer > .quad ..Dcfb0 ## initial location > .quad ..Dcfe0-..Dcfb0 ## address range > .byte 0x4 ## DW_CFA_advance_loc4 > .4byte ..Dcfi0-..Dcfb0 > .byte 0xe ## DW_CFA_def_cfa_offset > .byte 0x10 ## ULEB128 16 > .byte 0x86 ## DW_CFA_offset (col 6) > .byte 0x2 ## ULEB128 2 > .byte 0x4 ## DW_CFA_advance_loc4 > .4byte ..Dcfi1-..Dcfi0 > .byte 0xd ## DW_CFA_def_cfa_register (col 6) > .byte 0x6 ## ULEB128 6 > .align 8 > ..Dfdee0: > .4byte ..Dfdee1-..Dfdeb1 ## FDE length > ..Dfdeb1: > .4byte ..Dcieb0 ## CIE pointer > .quad ..Dcfb1 ## initial location > .quad ..Dcfe1-..Dcfb1 ## address range > .byte 0x4 ## DW_CFA_advance_loc4 > .4byte ..Dcfi2-..Dcfb1 > .byte 0xe ## DW_CFA_def_cfa_offset > .byte 0x10 ## ULEB128 16 > .byte 0x86 ## DW_CFA_offset (col 6) > .byte 0x2 ## ULEB128 2 > .byte 0x4 ## DW_CFA_advance_loc4 > .4byte ..Dcfi3-..Dcfi2 > .byte 0xd ## DW_CFA_def_cfa_register (col 6) > .byte 0x6 ## ULEB128 6 > .align 8 > ..Dfdee1: > .ident "PGC 7.0-7" > [7:49] svbu-mpi:~/tmp % > > ----- > > > > On Jun 8, 2010, at 10:46 AM, George Bosilca wrote: > >> It didn't work. 
Let's try with this small complete application: >> >> #include <stdint.h> >> >> #define SMPLOCK "lock;" >> >> static inline int opal_atomic_cmpset_32( volatile int32_t *addr, >> int32_t oldval, int32_t newval) >> { >> unsigned char ret; >> __asm__ __volatile__ ( >> SMPLOCK "cmpxchgl %1,%2 \n\t" >> "sete %0 \n\t" >> : "=qm" (ret) >> : "q"(newval), "m"(*addr), "a"(oldval) >> : "memory"); >> >> return (int)ret; >> } >> >> int main(int argc, char* argv[] ) >> { >> int32_t value = 0, oldval = 0, delta = 1; >> int32_t* addr = &value; >> >> do { >> oldval = *addr; >> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta)); >> return (oldval + delta); >> } >> >> >> >> Thanks, >> george. >> >> >> On Jun 8, 2010, at 14:42 , Jeff Squyres wrote: >> >>> Look at my output -- I did... >>> >>> On Jun 8, 2010, at 10:40 AM, George Bosilca wrote: >>> >>>> Still no good, the opal_atomic_cmpset_32 is not inlined. Try to add -O3 to >>>> your command line, this helped for gcc. >>>> >>>> Thanks, >>>> george. >>>> >>>> On Jun 8, 2010, at 14:14 , Jeff Squyres wrote: >>>> >>>>> On Jun 8, 2010, at 9:53 AM, George Bosilca wrote: >>>>> >>>>>> As you can see there is no explicit call, the opal_atomic_cmpset_32 is >>>>>> really inlined. I think the problem is that you didn't specify the -O3 >>>>>> flag on your command line. >>>>> >>>>> Ah, you wanted me to compile the OMPI code itself and send you the >>>>> assembly. That's not what you asked for. :-) >>>>> >>>>> (I just took the code you sent in the mail, stuffed it into george.c, and >>>>> compiled that with -s -- outside of the context of the Open MPI code tree) >>>>> >>>>> Here's the new output. 
It still didn't inline, but you can see the code >>>>> for the _cmpset function: >>>>> >>>>> ----- >>>>> [7:13] svbu-mpi:~/tmp % cat george.c >>>>> >>>>> #include <stdint.h> >>>>> >>>>> #include "opal/sys/atomic.h" >>>>> >>>>> int foo(void) { >>>>> int32_t oldval, delta; >>>>> int32_t *addr = 0; >>>>> do { >>>>> oldval = *addr; >>>>> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta)); >>>>> return (oldval + delta); >>>>> } >>>>> >>>>> [7:13] svbu-mpi:~/tmp % pgcc -O3 -I /home/jsquyres/svn/ompi4 >>>>> -I/home/jsquyres/svn/ompi4/opal/include -c -s george.c >>>>> [7:13] svbu-mpi:~/tmp % cat george.s >>>>> .file "george.c" >>>>> .version "01.01" >>>>> ## PGC 7.0 -opt 1 >>>>> ## PGC 06/08/2010 05:10:04 >>>>> ## pgcc george.c -c -S >>>>> ## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc >>>>> ## george.c -opt 1 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x >>>>> 123 0x1000 >>>>> ## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1 >>>>> -quad >>>>> ## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc >>>>> /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include >>>>> ## -def unix -def __unix -def __unix__ -def linux -def __linux -def >>>>> __linux__ >>>>> ## -def __NO_MATH_INLINES -def __x86_64__ -def >>>>> __LONG_MAX__=9223372036854775807L >>>>> ## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int >>>>> -def __THROW= >>>>> ## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def >>>>> __SSE2__ >>>>> ## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) >>>>> #cpu(x86_64) >>>>> ## -cmdline +pgcc george.c -c -S -x 123 4 -x 123 0x80000000 -alwaysinline >>>>> /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4 >>>>> ## -asm george.s >>>>> ## lineno: 3 >>>>> .text >>>>> .align 16 >>>>> .globl foo >>>>> foo: >>>>> ..Dcfb0: >>>>> pushq %rbp >>>>> ..Dcfi0: >>>>> movq %rsp, %rbp >>>>> 
..Dcfi1: >>>>> subq $16, %rsp >>>>> ..EN1: >>>>> ## lineno: 5 >>>>> movq $0, -8(%rbp) >>>>> .p2align 4,,3 >>>>> .LB157: >>>>> ## lineno: 6 >>>>> movq -8(%rbp), %rdi >>>>> movl (%rdi), %esi >>>>> movl %esi, -12(%rbp) >>>>> movl -16(%rbp), %edx >>>>> addl %esi, %edx >>>>> xorl %eax, %eax >>>>> call opal_atomic_cmpset_32 >>>>> testl %eax, %eax >>>>> je .LB157 >>>>> movl -16(%rbp), %eax >>>>> addl -12(%rbp), %eax >>>>> ## lineno: 10 >>>>> leave >>>>> ret >>>>> .type foo,@function >>>>> .size foo,.-foo >>>>> ..Dcfe0: >>>>> __fooEND: >>>>> .section .pgi_trace >>>>> .align 8 >>>>> .quad foo ## address of routine >>>>> .quad __fooEND - foo ## size of routine >>>>> .2byte 0 ## flags for future use >>>>> .2byte 3 ## length of following string >>>>> ## name:foo: >>>>> .byte 0x66,0x6f,0x6f,0x00 >>>>> .data >>>>> .globl opal_atomic_cmpset_32 >>>>> .section .debug_frame >>>>> ..Dcieb0: >>>>> .4byte ..Dciee0-..Dcieb0-4 ## CIE length >>>>> .4byte 0xffffffff ## CIE ID >>>>> .byte 0x1 ## CIE version >>>>> .byte 0x0 ## no augmentation >>>>> .byte 0x1 ## ULEB128 1, code alignment factor >>>>> .byte 0x78 ## SLEB128 -8, data alignment factor >>>>> .byte 0x10 ## return address column >>>>> .byte 0xc ## DW_CFA_def_cfa (col 7) >>>>> .byte 0x7 ## ULEB128 7 >>>>> .byte 0x8 ## ULEB128 8 >>>>> .byte 0x90 ## DW_CFA_offset (col 16) >>>>> .byte 0x1 ## ULEB128 1 >>>>> .align 8 >>>>> ..Dciee0: >>>>> .4byte ..Dfdee0-..Dfdeb0 ## FDE length >>>>> ..Dfdeb0: >>>>> .4byte ..Dcieb0 ## CIE pointer >>>>> .quad ..Dcfb0 ## initial location >>>>> .quad ..Dcfe0-..Dcfb0 ## address range >>>>> .byte 0x4 ## DW_CFA_advance_loc4 >>>>> .4byte ..Dcfi0-..Dcfb0 >>>>> .byte 0xe ## DW_CFA_def_cfa_offset >>>>> .byte 0x10 ## ULEB128 16 >>>>> .byte 0x86 ## DW_CFA_offset (col 6) >>>>> .byte 0x2 ## ULEB128 2 >>>>> .byte 0x4 ## DW_CFA_advance_loc4 >>>>> .4byte ..Dcfi1-..Dcfi0 >>>>> .byte 0xd ## DW_CFA_def_cfa_register (col 6) >>>>> .byte 0x6 ## ULEB128 6 >>>>> .align 8 >>>>> ..Dfdee0: >>>>> .ident "PGC 7.0-7" >>>>> [7:13] 
svbu-mpi:~/tmp % >>>>> ----- >>>>> >>>>> -- >>>>> Jeff Squyres >>>>> jsquy...@cisco.com >>>>> For corporate legal information go to: >>>>> http://www.cisco.com/web/about/doing_business/legal/cri/ >>>>> >>>>> >>>>> _______________________________________________ >>>>> devel mailing list >>>>> de...@open-mpi.org >>>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel >>>> >>>> >>>> _______________________________________________ >>>> devel mailing list >>>> de...@open-mpi.org >>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel >>>> >>> >>> >>> -- >>> Jeff Squyres >>> jsquy...@cisco.com >>> For corporate legal information go to: >>> http://www.cisco.com/web/about/doing_business/legal/cri/ >>> >>> >>> _______________________________________________ >>> devel mailing list >>> de...@open-mpi.org >>> http://www.open-mpi.org/mailman/listinfo.cgi/devel >> >> >> _______________________________________________ >> devel mailing list >> de...@open-mpi.org >> http://www.open-mpi.org/mailman/listinfo.cgi/devel > > > -- > Jeff Squyres > jsquy...@cisco.com > For corporate legal information go to: > http://www.cisco.com/web/about/doing_business/legal/cri/ > > > _______________________________________________ > devel mailing list > de...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/devel