On Jun 8, 2010, at 14:49 , Jeff Squyres wrote:

> ## -cmdline +pgcc george2.c -O3 -c -S -x 123 4 -x 123 0x80000000 
> -alwaysinline /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
> ## -asm george2.s
> ## lineno: 7
>       .text
>       .align  16
> opal_atomic_cmpset_32:
> ..Dcfb0:
>       pushq   %rbp
> ..Dcfi0:
>       movq    %rsp, %rbp
> ..Dcfi1:
>       movl    %esi, -4(%rbp)
>       movl    %edx, -8(%rbp)
> ..EN1:
> ## lineno: 16
>       movl    -8(%rbp), %edx
>       movl    -4(%rbp), %eax

oldval is moved into %eax ... once

> lock;cmpxchgl %edx,(%rdi)   
>       sete     %cl      

The condition code (the zero flag set by cmpxchgl) is retrieved

>       movb    %cl, -9(%rbp)

And stored.

But the loop has disappeared. I have serious doubts about the correctness of this 
assembly code.

  george.

> ## lineno: 17
>       movzbl  -9(%rbp), %eax
> ## lineno: 0
>       popq    %rbp
>       ret
>       .type   opal_atomic_cmpset_32,@function
>       .size   opal_atomic_cmpset_32,.-opal_atomic_cmpset_32
> ..Dcfe0:
> __opal_atomic_cmpset_32END:
>       .section        .pgi_trace
>       .align  8
>       .quad   opal_atomic_cmpset_32   ## address of routine
>       .quad   __opal_atomic_cmpset_32END - opal_atomic_cmpset_32      ## size 
> of routine
>       .2byte  0       ## flags for future use
>       .2byte  21      ## length of following string
> ## name:opal_atomic_cmpset_32:
>       .byte   0x6f,0x70,0x61,0x6c,0x5f,0x61,0x74,0x6f,0x6d,0x69,0x63
>       .byte   0x5f,0x63,0x6d,0x70,0x73,0x65,0x74,0x5f,0x33,0x32,0x00
> ## lineno: 20
>       .text
>       .align  16
>       .globl  main
> main:
> ..Dcfb1:
>       pushq   %rbp
> ..Dcfi2:
>       movq    %rsp, %rbp
> ..Dcfi3:
>       subq    $16, %rsp
>       movq    %rbx, -16(%rbp)
>       pushq   %rax
>       pushq   %rax
>       stmxcsr (%rsp)
>       popq    %rax
>       orq     $64, %rax
>       pushq   %rax
>       ldmxcsr (%rsp)
>       popq    %rax
>       popq    %rax
> ## lineno: 0
> ..EN2:
> ## lineno: 21
>       movl    $0, -4(%rbp)
>       .align  8
> .LB191:
> ## lineno: 24
>       movl    -4(%rbp), %ebx
>       leaq    -4(%rbp), %rdi
>       leal    1(%rbx), %edx
>       movl    %ebx, %esi
>       call    opal_atomic_cmpset_32
>       testl   %eax, %eax
>       je      .LB191
> ## lineno: 27
>       addl    $1, %ebx
>       movl    %ebx, %eax
> ## lineno: 28
>       movq    -16(%rbp), %rbx
>       leave
>       ret
>       .type   main,@function
>       .size   main,.-main
> ..Dcfe1:
> __mainEND:
>       .section        .pgi_trace
>       .align  8
>       .quad   main    ## address of routine
>       .quad   __mainEND - main        ## size of routine
>       .2byte  0       ## flags for future use
>       .2byte  4       ## length of following string
> ## name:main:
>       .byte   0x6d,0x61,0x69,0x6e,0x00
>       .data
>       .section        .debug_frame
> ..Dcieb0:
>       .4byte  ..Dciee0-..Dcieb0-4     ## CIE length
>       .4byte  0xffffffff      ## CIE ID
>       .byte   0x1     ## CIE version
>       .byte   0x0     ## no augmentation
>       .byte   0x1     ## ULEB128 1, code alignment factor
>       .byte   0x78    ## SLEB128 -8, data alignment factor
>       .byte   0x10    ## return address column
>       .byte   0xc     ## DW_CFA_def_cfa (col 7)
>       .byte   0x7     ## ULEB128 7
>       .byte   0x8     ## ULEB128 8
>       .byte   0x90    ## DW_CFA_offset (col 16)
>       .byte   0x1     ## ULEB128 1
>       .align  8
> ..Dciee0:
>       .4byte  ..Dfdee0-..Dfdeb0       ## FDE length
> ..Dfdeb0:
>       .4byte  ..Dcieb0        ## CIE pointer
>       .quad   ..Dcfb0 ## initial location
>       .quad   ..Dcfe0-..Dcfb0 ## address range
>       .byte   0x4     ## DW_CFA_advance_loc4
>       .4byte  ..Dcfi0-..Dcfb0
>       .byte   0xe     ## DW_CFA_def_cfa_offset
>       .byte   0x10    ## ULEB128 16
>       .byte   0x86    ## DW_CFA_offset (col 6)
>       .byte   0x2     ## ULEB128 2
>       .byte   0x4     ## DW_CFA_advance_loc4
>       .4byte  ..Dcfi1-..Dcfi0
>       .byte   0xd     ## DW_CFA_def_cfa_register (col 6)
>       .byte   0x6     ## ULEB128 6
>       .align  8
> ..Dfdee0:
>       .4byte  ..Dfdee1-..Dfdeb1       ## FDE length
> ..Dfdeb1:
>       .4byte  ..Dcieb0        ## CIE pointer
>       .quad   ..Dcfb1 ## initial location
>       .quad   ..Dcfe1-..Dcfb1 ## address range
>       .byte   0x4     ## DW_CFA_advance_loc4
>       .4byte  ..Dcfi2-..Dcfb1
>       .byte   0xe     ## DW_CFA_def_cfa_offset
>       .byte   0x10    ## ULEB128 16
>       .byte   0x86    ## DW_CFA_offset (col 6)
>       .byte   0x2     ## ULEB128 2
>       .byte   0x4     ## DW_CFA_advance_loc4
>       .4byte  ..Dcfi3-..Dcfi2
>       .byte   0xd     ## DW_CFA_def_cfa_register (col 6)
>       .byte   0x6     ## ULEB128 6
>       .align  8
> ..Dfdee1:
>       .ident  "PGC 7.0-7"
> [7:49] svbu-mpi:~/tmp % 
> 
> -----
> 
> 
> 
> On Jun 8, 2010, at 10:46 AM, George Bosilca wrote:
> 
>> It didn't work. Let's try with this small complete application:
>> 
>> #include <stdint.h>
>> 
>> #define SMPLOCK "lock;"
>> 
>> static inline int opal_atomic_cmpset_32( volatile int32_t *addr,
>>                                       int32_t oldval, int32_t newval)
>> {
>>  unsigned char ret;
>>  __asm__ __volatile__ (
>>                      SMPLOCK "cmpxchgl %1,%2   \n\t"
>>                              "sete     %0      \n\t"
>>                      : "=qm" (ret)
>>                      : "q"(newval), "m"(*addr), "a"(oldval)
>>                      : "memory");
>> 
>>  return (int)ret;
>> }
>> 
>> int main(int argc, char* argv[] )
>> {
>> int32_t value = 0, oldval = 0, delta = 1;
>> int32_t* addr = &value;
>> 
>> do {
>>     oldval = *addr;
>> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
>> return (oldval + delta);
>> }
>> 
>> 
>> 
>> Thanks,
>>   george.
>> 
>> 
>> On Jun 8, 2010, at 14:42 , Jeff Squyres wrote:
>> 
>>> Look at my output -- I did...
>>> 
>>> On Jun 8, 2010, at 10:40 AM, George Bosilca wrote:
>>> 
>>>> Still no good, the opal_atomic_cmpset_32 is not inlined. Try to add -O3 to 
>>>> your command line, this helped for gcc.
>>>> 
>>>> Thanks,
>>>>  george.
>>>> 
>>>> On Jun 8, 2010, at 14:14 , Jeff Squyres wrote:
>>>> 
>>>>> On Jun 8, 2010, at 9:53 AM, George Bosilca wrote:
>>>>> 
>>>>>> As you can see there is no explicit call, the opal_atomic_cmpset_32 is 
>>>>>> really inlined. I think the problem is that you didn't specify the -O3 
>>>>>> flag on your command line.
>>>>> 
>>>>> Ah, you wanted me to compile the OMPI code itself and send you the 
>>>>> assembly.  That's not what you asked for.  :-)
>>>>> 
>>>>> (I just took the code you sent in the mail, stuffed it into george.c, and 
>>>>> compiled that with -s -- outside of the context of the Open MPI code tree)
>>>>> 
>>>>> Here's the new output.  It still didn't inline, but you can see the code 
>>>>> for the _cmpset function:
>>>>> 
>>>>> -----
>>>>> [7:13] svbu-mpi:~/tmp % cat george.c                                      
>>>>>      
>>>>> #include <stdint.h>
>>>>> 
>>>>> #include "opal/sys/atomic.h"
>>>>> 
>>>>> int foo(void) {
>>>>> int32_t oldval, delta;
>>>>> int32_t *addr = 0;
>>>>> do {
>>>>>   oldval = *addr;
>>>>> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
>>>>> return (oldval + delta);
>>>>> }
>>>>> 
>>>>> [7:13] svbu-mpi:~/tmp % pgcc -O3 -I /home/jsquyres/svn/ompi4 
>>>>> -I/home/jsquyres/svn/ompi4/opal/include -c -s george.c
>>>>> [7:13] svbu-mpi:~/tmp % cat george.s                                      
>>>>>             .file   "george.c"
>>>>>    .version        "01.01"
>>>>> ## PGC 7.0 -opt 1
>>>>> ## PGC 06/08/2010  05:10:04
>>>>> ## pgcc george.c -c -S
>>>>> ## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc
>>>>> ## george.c -opt 1 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x 
>>>>> 123 0x1000
>>>>> ## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1 
>>>>> -quad
>>>>> ## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc 
>>>>> /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include
>>>>> ## -def unix -def __unix -def __unix__ -def linux -def __linux -def 
>>>>> __linux__
>>>>> ## -def __NO_MATH_INLINES -def __x86_64__ -def 
>>>>> __LONG_MAX__=9223372036854775807L
>>>>> ## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int 
>>>>> -def __THROW=
>>>>> ## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def 
>>>>> __SSE2__
>>>>> ## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) 
>>>>> #cpu(x86_64)
>>>>> ## -cmdline +pgcc george.c -c -S -x 123 4 -x 123 0x80000000 -alwaysinline 
>>>>> /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
>>>>> ## -asm george.s
>>>>> ## lineno: 3
>>>>>    .text
>>>>>    .align  16
>>>>>    .globl  foo
>>>>> foo:
>>>>> ..Dcfb0:
>>>>>    pushq   %rbp
>>>>> ..Dcfi0:
>>>>>    movq    %rsp, %rbp
>>>>> ..Dcfi1:
>>>>>    subq    $16, %rsp
>>>>> ..EN1:
>>>>> ## lineno: 5
>>>>>    movq    $0, -8(%rbp)
>>>>>    .p2align        4,,3
>>>>> .LB157:
>>>>> ## lineno: 6
>>>>>    movq    -8(%rbp), %rdi
>>>>>    movl    (%rdi), %esi
>>>>>    movl    %esi, -12(%rbp)
>>>>>    movl    -16(%rbp), %edx
>>>>>    addl    %esi, %edx
>>>>>    xorl    %eax, %eax
>>>>>    call    opal_atomic_cmpset_32
>>>>>    testl   %eax, %eax
>>>>>    je      .LB157
>>>>>    movl    -16(%rbp), %eax
>>>>>    addl    -12(%rbp), %eax
>>>>> ## lineno: 10
>>>>>    leave
>>>>>    ret
>>>>>    .type   foo,@function
>>>>>    .size   foo,.-foo
>>>>> ..Dcfe0:
>>>>> __fooEND:
>>>>>    .section        .pgi_trace
>>>>>    .align  8
>>>>>    .quad   foo     ## address of routine
>>>>>    .quad   __fooEND - foo  ## size of routine
>>>>>    .2byte  0       ## flags for future use
>>>>>    .2byte  3       ## length of following string
>>>>> ## name:foo:
>>>>>    .byte   0x66,0x6f,0x6f,0x00
>>>>>    .data
>>>>>    .globl  opal_atomic_cmpset_32
>>>>>    .section        .debug_frame
>>>>> ..Dcieb0:
>>>>>    .4byte  ..Dciee0-..Dcieb0-4     ## CIE length
>>>>>    .4byte  0xffffffff      ## CIE ID
>>>>>    .byte   0x1     ## CIE version
>>>>>    .byte   0x0     ## no augmentation
>>>>>    .byte   0x1     ## ULEB128 1, code alignment factor
>>>>>    .byte   0x78    ## SLEB128 -8, data alignment factor
>>>>>    .byte   0x10    ## return address column
>>>>>    .byte   0xc     ## DW_CFA_def_cfa (col 7)
>>>>>    .byte   0x7     ## ULEB128 7
>>>>>    .byte   0x8     ## ULEB128 8
>>>>>    .byte   0x90    ## DW_CFA_offset (col 16)
>>>>>    .byte   0x1     ## ULEB128 1
>>>>>    .align  8
>>>>> ..Dciee0:
>>>>>    .4byte  ..Dfdee0-..Dfdeb0       ## FDE length
>>>>> ..Dfdeb0:
>>>>>    .4byte  ..Dcieb0        ## CIE pointer
>>>>>    .quad   ..Dcfb0 ## initial location
>>>>>    .quad   ..Dcfe0-..Dcfb0 ## address range
>>>>>    .byte   0x4     ## DW_CFA_advance_loc4
>>>>>    .4byte  ..Dcfi0-..Dcfb0
>>>>>    .byte   0xe     ## DW_CFA_def_cfa_offset
>>>>>    .byte   0x10    ## ULEB128 16
>>>>>    .byte   0x86    ## DW_CFA_offset (col 6)
>>>>>    .byte   0x2     ## ULEB128 2
>>>>>    .byte   0x4     ## DW_CFA_advance_loc4
>>>>>    .4byte  ..Dcfi1-..Dcfi0
>>>>>    .byte   0xd     ## DW_CFA_def_cfa_register (col 6)
>>>>>    .byte   0x6     ## ULEB128 6
>>>>>    .align  8
>>>>> ..Dfdee0:
>>>>>    .ident  "PGC 7.0-7"
>>>>> [7:13] svbu-mpi:~/tmp %
>>>>> -----
>>>>> 
>>>>> --
>>>>> Jeff Squyres
>>>>> jsquy...@cisco.com
>>>>> For corporate legal information go to:
>>>>> http://www.cisco.com/web/about/doing_business/legal/cri/
>>>>> 
>>>>> 
>>>>> _______________________________________________
>>>>> devel mailing list
>>>>> de...@open-mpi.org
>>>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>>> 
>>>> 
>>>> _______________________________________________
>>>> devel mailing list
>>>> de...@open-mpi.org
>>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>>> 
>>> 
>>> 
>>> -- 
>>> Jeff Squyres
>>> jsquy...@cisco.com
>>> For corporate legal information go to:
>>> http://www.cisco.com/web/about/doing_business/legal/cri/
>>> 
>>> 
>>> _______________________________________________
>>> devel mailing list
>>> de...@open-mpi.org
>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>> 
>> 
>> _______________________________________________
>> devel mailing list
>> de...@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> 
> 
> -- 
> Jeff Squyres
> jsquy...@cisco.com
> For corporate legal information go to:
> http://www.cisco.com/web/about/doing_business/legal/cri/
> 
> 
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel


Reply via email to