Oops -- I see the problem; I used "-s" instead of "-S".  So that old output I 
sent you was kinda meaningless; it was from some other test.  Sorry!  :-(

But your test is much smaller and easier to check, so let's do that:

-----
[7:49] svbu-mpi:~/tmp % cat george2.c 
#include <stdint.h>

#define SMPLOCK "lock;"

static inline int opal_atomic_cmpset_32( volatile int32_t *addr,
                                         int32_t oldval, int32_t newval)
{
    unsigned char ret;
    __asm__ __volatile__ (
                      SMPLOCK "cmpxchgl %1,%2   \n\t"
                              "sete     %0      \n\t"
                      : "=qm" (ret)
                      : "q"(newval), "m"(*addr), "a"(oldval)
                      : "memory");

    return (int)ret;
}

int main(int argc, char* argv[] )
{
    int32_t value = 0, oldval = 0, delta = 1;
    int32_t* addr = &value;

    do {
        oldval = *addr;
    } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
    return (oldval + delta);
}
[7:49] svbu-mpi:~/tmp % pgcc -O3 -c -S george2.c
[7:49] svbu-mpi:~/tmp % cat george2.s 
        .file   "george2.c"
        .version        "01.01"
## PGC 7.0 -opt 3
## PGC 06/08/2010  07:49:33
## pgcc george2.c -O3 -c -S
## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc
## george2.c -opt 3 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x 123 0x1000
## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1 -quad
## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include
## -def unix -def __unix -def __unix__ -def linux -def __linux -def __linux__
## -def __NO_MATH_INLINES -def __x86_64__ -def __LONG_MAX__=9223372036854775807L
## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int -def __THROW=
## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def __SSE2__
## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) #cpu(x86_64)
## -cmdline +pgcc george2.c -O3 -c -S -x 123 4 -x 123 0x80000000 -alwaysinline /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
## -asm george2.s
## lineno: 7
        .text
        .align  16
opal_atomic_cmpset_32:
..Dcfb0:
        pushq   %rbp
..Dcfi0:
        movq    %rsp, %rbp
..Dcfi1:
        movl    %esi, -4(%rbp)
        movl    %edx, -8(%rbp)
..EN1:
## lineno: 16
        movl    -8(%rbp), %edx
        movl    -4(%rbp), %eax
lock;cmpxchgl %edx,(%rdi)   
        sete     %cl      

        movb    %cl, -9(%rbp)
## lineno: 17
        movzbl  -9(%rbp), %eax
## lineno: 0
        popq    %rbp
        ret
        .type   opal_atomic_cmpset_32,@function
        .size   opal_atomic_cmpset_32,.-opal_atomic_cmpset_32
..Dcfe0:
__opal_atomic_cmpset_32END:
        .section        .pgi_trace
        .align  8
        .quad   opal_atomic_cmpset_32   ## address of routine
        .quad   __opal_atomic_cmpset_32END - opal_atomic_cmpset_32      ## size of routine
        .2byte  0       ## flags for future use
        .2byte  21      ## length of following string
## name:opal_atomic_cmpset_32:
        .byte   0x6f,0x70,0x61,0x6c,0x5f,0x61,0x74,0x6f,0x6d,0x69,0x63
        .byte   0x5f,0x63,0x6d,0x70,0x73,0x65,0x74,0x5f,0x33,0x32,0x00
## lineno: 20
        .text
        .align  16
        .globl  main
main:
..Dcfb1:
        pushq   %rbp
..Dcfi2:
        movq    %rsp, %rbp
..Dcfi3:
        subq    $16, %rsp
        movq    %rbx, -16(%rbp)
        pushq   %rax
        pushq   %rax
        stmxcsr (%rsp)
        popq    %rax
        orq     $64, %rax
        pushq   %rax
        ldmxcsr (%rsp)
        popq    %rax
        popq    %rax
## lineno: 0
..EN2:
## lineno: 21
        movl    $0, -4(%rbp)
        .align  8
.LB191:
## lineno: 24
        movl    -4(%rbp), %ebx
        leaq    -4(%rbp), %rdi
        leal    1(%rbx), %edx
        movl    %ebx, %esi
        call    opal_atomic_cmpset_32
        testl   %eax, %eax
        je      .LB191
## lineno: 27
        addl    $1, %ebx
        movl    %ebx, %eax
## lineno: 28
        movq    -16(%rbp), %rbx
        leave
        ret
        .type   main,@function
        .size   main,.-main
..Dcfe1:
__mainEND:
        .section        .pgi_trace
        .align  8
        .quad   main    ## address of routine
        .quad   __mainEND - main        ## size of routine
        .2byte  0       ## flags for future use
        .2byte  4       ## length of following string
## name:main:
        .byte   0x6d,0x61,0x69,0x6e,0x00
        .data
        .section        .debug_frame
..Dcieb0:
        .4byte  ..Dciee0-..Dcieb0-4     ## CIE length
        .4byte  0xffffffff      ## CIE ID
        .byte   0x1     ## CIE version
        .byte   0x0     ## no augmentation
        .byte   0x1     ## ULEB128 1, code alignment factor
        .byte   0x78    ## SLEB128 -8, data alignment factor
        .byte   0x10    ## return address column
        .byte   0xc     ## DW_CFA_def_cfa (col 7)
        .byte   0x7     ## ULEB128 7
        .byte   0x8     ## ULEB128 8
        .byte   0x90    ## DW_CFA_offset (col 16)
        .byte   0x1     ## ULEB128 1
        .align  8
..Dciee0:
        .4byte  ..Dfdee0-..Dfdeb0       ## FDE length
..Dfdeb0:
        .4byte  ..Dcieb0        ## CIE pointer
        .quad   ..Dcfb0 ## initial location
        .quad   ..Dcfe0-..Dcfb0 ## address range
        .byte   0x4     ## DW_CFA_advance_loc4
        .4byte  ..Dcfi0-..Dcfb0
        .byte   0xe     ## DW_CFA_def_cfa_offset
        .byte   0x10    ## ULEB128 16
        .byte   0x86    ## DW_CFA_offset (col 6)
        .byte   0x2     ## ULEB128 2
        .byte   0x4     ## DW_CFA_advance_loc4
        .4byte  ..Dcfi1-..Dcfi0
        .byte   0xd     ## DW_CFA_def_cfa_register (col 6)
        .byte   0x6     ## ULEB128 6
        .align  8
..Dfdee0:
        .4byte  ..Dfdee1-..Dfdeb1       ## FDE length
..Dfdeb1:
        .4byte  ..Dcieb0        ## CIE pointer
        .quad   ..Dcfb1 ## initial location
        .quad   ..Dcfe1-..Dcfb1 ## address range
        .byte   0x4     ## DW_CFA_advance_loc4
        .4byte  ..Dcfi2-..Dcfb1
        .byte   0xe     ## DW_CFA_def_cfa_offset
        .byte   0x10    ## ULEB128 16
        .byte   0x86    ## DW_CFA_offset (col 6)
        .byte   0x2     ## ULEB128 2
        .byte   0x4     ## DW_CFA_advance_loc4
        .4byte  ..Dcfi3-..Dcfi2
        .byte   0xd     ## DW_CFA_def_cfa_register (col 6)
        .byte   0x6     ## ULEB128 6
        .align  8
..Dfdee1:
        .ident  "PGC 7.0-7"
[7:49] svbu-mpi:~/tmp % 

-----



On Jun 8, 2010, at 10:46 AM, George Bosilca wrote:

> It didn't work. Let's try with this small complete application:
> 
> #include <stdint.h>
> 
> #define SMPLOCK "lock;"
> 
> static inline int opal_atomic_cmpset_32( volatile int32_t *addr,
>                                        int32_t oldval, int32_t newval)
> {
>   unsigned char ret;
>   __asm__ __volatile__ (
>                       SMPLOCK "cmpxchgl %1,%2   \n\t"
>                               "sete     %0      \n\t"
>                       : "=qm" (ret)
>                       : "q"(newval), "m"(*addr), "a"(oldval)
>                       : "memory");
> 
>   return (int)ret;
> }
> 
> int main(int argc, char* argv[] )
> {
>  int32_t value = 0, oldval = 0, delta = 1;
>  int32_t* addr = &value;
> 
>  do {
>      oldval = *addr;
>  } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
>  return (oldval + delta);
> }
> 
> 
> 
>  Thanks,
>    george.
> 
> 
> On Jun 8, 2010, at 14:42 , Jeff Squyres wrote:
> 
>> Look at my output -- I did...
>> 
>> On Jun 8, 2010, at 10:40 AM, George Bosilca wrote:
>> 
>>> Still no good, the opal_atomic_cmpset_32 is not inlined. Try to add -O3 to 
>>> your command line, this helped for gcc.
>>> 
>>> Thanks,
>>>   george.
>>> 
>>> On Jun 8, 2010, at 14:14 , Jeff Squyres wrote:
>>> 
>>>> On Jun 8, 2010, at 9:53 AM, George Bosilca wrote:
>>>> 
>>>>> As you can see there is no explicit call, the opal_atomic_cmpset_32 is 
>>>>> really inlined. I think the problem is that you didn't specify the -O3 
>>>>> flag on your command line.
>>>> 
>>>> Ah, you wanted me to compile the OMPI code itself and send you the 
>>>> assembly.  That's not what you asked for.  :-)
>>>> 
>>>> (I just took the code you sent in the mail, stuffed it into george.c, and 
>>>> compiled that with -s -- outside of the context of the Open MPI code tree)
>>>> 
>>>> Here's the new output.  It still didn't inline, but you can see the code 
>>>> for the _cmpset function:
>>>> 
>>>> -----
>>>> [7:13] svbu-mpi:~/tmp % cat george.c                                       
>>>>     
>>>> #include <stdint.h>
>>>> 
>>>> #include "opal/sys/atomic.h"
>>>> 
>>>> int foo(void) {
>>>> int32_t oldval, delta;
>>>> int32_t *addr = 0;
>>>> do {
>>>>    oldval = *addr;
>>>> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
>>>> return (oldval + delta);
>>>> }
>>>> 
>>>> [7:13] svbu-mpi:~/tmp % pgcc -O3 -I /home/jsquyres/svn/ompi4 
>>>> -I/home/jsquyres/svn/ompi4/opal/include -c -s george.c
>>>> [7:13] svbu-mpi:~/tmp % cat george.s                                       
>>>>            .file   "george.c"
>>>>     .version        "01.01"
>>>> ## PGC 7.0 -opt 1
>>>> ## PGC 06/08/2010  05:10:04
>>>> ## pgcc george.c -c -S
>>>> ## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc
>>>> ## george.c -opt 1 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x 
>>>> 123 0x1000
>>>> ## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1 
>>>> -quad
>>>> ## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc 
>>>> /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include
>>>> ## -def unix -def __unix -def __unix__ -def linux -def __linux -def 
>>>> __linux__
>>>> ## -def __NO_MATH_INLINES -def __x86_64__ -def 
>>>> __LONG_MAX__=9223372036854775807L
>>>> ## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int 
>>>> -def __THROW=
>>>> ## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def 
>>>> __SSE2__
>>>> ## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) 
>>>> #cpu(x86_64)
>>>> ## -cmdline +pgcc george.c -c -S -x 123 4 -x 123 0x80000000 -alwaysinline 
>>>> /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
>>>> ## -asm george.s
>>>> ## lineno: 3
>>>>     .text
>>>>     .align  16
>>>>     .globl  foo
>>>> foo:
>>>> ..Dcfb0:
>>>>     pushq   %rbp
>>>> ..Dcfi0:
>>>>     movq    %rsp, %rbp
>>>> ..Dcfi1:
>>>>     subq    $16, %rsp
>>>> ..EN1:
>>>> ## lineno: 5
>>>>     movq    $0, -8(%rbp)
>>>>     .p2align        4,,3
>>>> .LB157:
>>>> ## lineno: 6
>>>>     movq    -8(%rbp), %rdi
>>>>     movl    (%rdi), %esi
>>>>     movl    %esi, -12(%rbp)
>>>>     movl    -16(%rbp), %edx
>>>>     addl    %esi, %edx
>>>>     xorl    %eax, %eax
>>>>     call    opal_atomic_cmpset_32
>>>>     testl   %eax, %eax
>>>>     je      .LB157
>>>>     movl    -16(%rbp), %eax
>>>>     addl    -12(%rbp), %eax
>>>> ## lineno: 10
>>>>     leave
>>>>     ret
>>>>     .type   foo,@function
>>>>     .size   foo,.-foo
>>>> ..Dcfe0:
>>>> __fooEND:
>>>>     .section        .pgi_trace
>>>>     .align  8
>>>>     .quad   foo     ## address of routine
>>>>     .quad   __fooEND - foo  ## size of routine
>>>>     .2byte  0       ## flags for future use
>>>>     .2byte  3       ## length of following string
>>>> ## name:foo:
>>>>     .byte   0x66,0x6f,0x6f,0x00
>>>>     .data
>>>>     .globl  opal_atomic_cmpset_32
>>>>     .section        .debug_frame
>>>> ..Dcieb0:
>>>>     .4byte  ..Dciee0-..Dcieb0-4     ## CIE length
>>>>     .4byte  0xffffffff      ## CIE ID
>>>>     .byte   0x1     ## CIE version
>>>>     .byte   0x0     ## no augmentation
>>>>     .byte   0x1     ## ULEB128 1, code alignment factor
>>>>     .byte   0x78    ## SLEB128 -8, data alignment factor
>>>>     .byte   0x10    ## return address column
>>>>     .byte   0xc     ## DW_CFA_def_cfa (col 7)
>>>>     .byte   0x7     ## ULEB128 7
>>>>     .byte   0x8     ## ULEB128 8
>>>>     .byte   0x90    ## DW_CFA_offset (col 16)
>>>>     .byte   0x1     ## ULEB128 1
>>>>     .align  8
>>>> ..Dciee0:
>>>>     .4byte  ..Dfdee0-..Dfdeb0       ## FDE length
>>>> ..Dfdeb0:
>>>>     .4byte  ..Dcieb0        ## CIE pointer
>>>>     .quad   ..Dcfb0 ## initial location
>>>>     .quad   ..Dcfe0-..Dcfb0 ## address range
>>>>     .byte   0x4     ## DW_CFA_advance_loc4
>>>>     .4byte  ..Dcfi0-..Dcfb0
>>>>     .byte   0xe     ## DW_CFA_def_cfa_offset
>>>>     .byte   0x10    ## ULEB128 16
>>>>     .byte   0x86    ## DW_CFA_offset (col 6)
>>>>     .byte   0x2     ## ULEB128 2
>>>>     .byte   0x4     ## DW_CFA_advance_loc4
>>>>     .4byte  ..Dcfi1-..Dcfi0
>>>>     .byte   0xd     ## DW_CFA_def_cfa_register (col 6)
>>>>     .byte   0x6     ## ULEB128 6
>>>>     .align  8
>>>> ..Dfdee0:
>>>>     .ident  "PGC 7.0-7"
>>>> [7:13] svbu-mpi:~/tmp %
>>>> -----
>>>> 
>>>> --
>>>> Jeff Squyres
>>>> jsquy...@cisco.com
>>>> For corporate legal information go to:
>>>> http://www.cisco.com/web/about/doing_business/legal/cri/
>>>> 
>>>> 
>>>> _______________________________________________
>>>> devel mailing list
>>>> de...@open-mpi.org
>>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>> 
>>> 
>>> _______________________________________________
>>> devel mailing list
>>> de...@open-mpi.org
>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>> 
>> 
>> 
>> -- 
>> Jeff Squyres
>> jsquy...@cisco.com
>> For corporate legal information go to:
>> http://www.cisco.com/web/about/doing_business/legal/cri/
>> 
>> 
>> _______________________________________________
>> devel mailing list
>> de...@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> 
> 
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel


-- 
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/


Reply via email to