We talked about this on the call today.  I filed 
https://svn.open-mpi.org/trac/ompi/ticket/2437 to track what we plan to do.


On Jun 8, 2010, at 11:40 AM, George Bosilca wrote:

> 
> On Jun 8, 2010, at 14:49 , Jeff Squyres wrote:
> 
> > ## -cmdline +pgcc george2.c -O3 -c -S -x 123 4 -x 123 0x80000000 
> > -alwaysinline /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
> > ## -asm george2.s
> > ## lineno: 7
> >       .text
> >       .align  16
> > opal_atomic_cmpset_32:
> > ..Dcfb0:
> >       pushq   %rbp
> > ..Dcfi0:
> >       movq    %rsp, %rbp
> > ..Dcfi1:
> >       movl    %esi, -4(%rbp)
> >       movl    %edx, -8(%rbp)
> > ..EN1:
> > ## lineno: 16
> >       movl    -8(%rbp), %edx
> >       movl    -4(%rbp), %eax
> 
> oldval is moved into the %eax ... once
> 
> > lock;cmpxchgl %edx,(%rdi)  
> >       sete     %cl     
> 
> The CCR is retrieved
> 
> >       movb    %cl, -9(%rbp)
> 
> And stored.
> 
> But the loop disappeared. I really have a doubt about the correctness of this 
> assembly code.
> 
>   george.
> 
> > ## lineno: 17
> >       movzbl  -9(%rbp), %eax
> > ## lineno: 0
> >       popq    %rbp
> >       ret
> >       .type   opal_atomic_cmpset_32,@function
> >       .size   opal_atomic_cmpset_32,.-opal_atomic_cmpset_32
> > ..Dcfe0:
> > __opal_atomic_cmpset_32END:
> >       .section        .pgi_trace
> >       .align  8
> >       .quad   opal_atomic_cmpset_32   ## address of routine
> >       .quad   __opal_atomic_cmpset_32END - opal_atomic_cmpset_32      ## 
> > size of routine
> >       .2byte  0       ## flags for future use
> >       .2byte  21      ## length of following string
> > ## name:opal_atomic_cmpset_32:
> >       .byte   0x6f,0x70,0x61,0x6c,0x5f,0x61,0x74,0x6f,0x6d,0x69,0x63
> >       .byte   0x5f,0x63,0x6d,0x70,0x73,0x65,0x74,0x5f,0x33,0x32,0x00
> > ## lineno: 20
> >       .text
> >       .align  16
> >       .globl  main
> > main:
> > ..Dcfb1:
> >       pushq   %rbp
> > ..Dcfi2:
> >       movq    %rsp, %rbp
> > ..Dcfi3:
> >       subq    $16, %rsp
> >       movq    %rbx, -16(%rbp)
> >       pushq   %rax
> >       pushq   %rax
> >       stmxcsr (%rsp)
> >       popq    %rax
> >       orq     $64, %rax
> >       pushq   %rax
> >       ldmxcsr (%rsp)
> >       popq    %rax
> >       popq    %rax
> > ## lineno: 0
> > ..EN2:
> > ## lineno: 21
> >       movl    $0, -4(%rbp)
> >       .align  8
> > .LB191:
> > ## lineno: 24
> >       movl    -4(%rbp), %ebx
> >       leaq    -4(%rbp), %rdi
> >       leal    1(%rbx), %edx
> >       movl    %ebx, %esi
> >       call    opal_atomic_cmpset_32
> >       testl   %eax, %eax
> >       je      .LB191
> > ## lineno: 27
> >       addl    $1, %ebx
> >       movl    %ebx, %eax
> > ## lineno: 28
> >       movq    -16(%rbp), %rbx
> >       leave
> >       ret
> >       .type   main,@function
> >       .size   main,.-main
> > ..Dcfe1:
> > __mainEND:
> >       .section        .pgi_trace
> >       .align  8
> >       .quad   main    ## address of routine
> >       .quad   __mainEND - main        ## size of routine
> >       .2byte  0       ## flags for future use
> >       .2byte  4       ## length of following string
> > ## name:main:
> >       .byte   0x6d,0x61,0x69,0x6e,0x00
> >       .data
> >       .section        .debug_frame
> > ..Dcieb0:
> >       .4byte  ..Dciee0-..Dcieb0-4     ## CIE length
> >       .4byte  0xffffffff      ## CIE ID
> >       .byte   0x1     ## CIE version
> >       .byte   0x0     ## no augmentation
> >       .byte   0x1     ## ULEB128 1, code alignment factor
> >       .byte   0x78    ## SLEB128 -8, data alignment factor
> >       .byte   0x10    ## return address column
> >       .byte   0xc     ## DW_CFA_def_cfa (col 7)
> >       .byte   0x7     ## ULEB128 7
> >       .byte   0x8     ## ULEB128 8
> >       .byte   0x90    ## DW_CFA_offset (col 16)
> >       .byte   0x1     ## ULEB128 1
> >       .align  8
> > ..Dciee0:
> >       .4byte  ..Dfdee0-..Dfdeb0       ## FDE length
> > ..Dfdeb0:
> >       .4byte  ..Dcieb0        ## CIE pointer
> >       .quad   ..Dcfb0 ## initial location
> >       .quad   ..Dcfe0-..Dcfb0 ## address range
> >       .byte   0x4     ## DW_CFA_advance_loc4
> >       .4byte  ..Dcfi0-..Dcfb0
> >       .byte   0xe     ## DW_CFA_def_cfa_offset
> >       .byte   0x10    ## ULEB128 16
> >       .byte   0x86    ## DW_CFA_offset (col 6)
> >       .byte   0x2     ## ULEB128 2
> >       .byte   0x4     ## DW_CFA_advance_loc4
> >       .4byte  ..Dcfi1-..Dcfi0
> >       .byte   0xd     ## DW_CFA_def_cfa_register (col 6)
> >       .byte   0x6     ## ULEB128 6
> >       .align  8
> > ..Dfdee0:
> >       .4byte  ..Dfdee1-..Dfdeb1       ## FDE length
> > ..Dfdeb1:
> >       .4byte  ..Dcieb0        ## CIE pointer
> >       .quad   ..Dcfb1 ## initial location
> >       .quad   ..Dcfe1-..Dcfb1 ## address range
> >       .byte   0x4     ## DW_CFA_advance_loc4
> >       .4byte  ..Dcfi2-..Dcfb1
> >       .byte   0xe     ## DW_CFA_def_cfa_offset
> >       .byte   0x10    ## ULEB128 16
> >       .byte   0x86    ## DW_CFA_offset (col 6)
> >       .byte   0x2     ## ULEB128 2
> >       .byte   0x4     ## DW_CFA_advance_loc4
> >       .4byte  ..Dcfi3-..Dcfi2
> >       .byte   0xd     ## DW_CFA_def_cfa_register (col 6)
> >       .byte   0x6     ## ULEB128 6
> >       .align  8
> > ..Dfdee1:
> >       .ident  "PGC 7.0-7"
> > [7:49] svbu-mpi:~/tmp %
> >
> > -----
> >
> >
> >
> > On Jun 8, 2010, at 10:46 AM, George Bosilca wrote:
> >
> >> It didn't work. Let's try with this small complete application:
> >>
> >> #include <stdint.h>
> >>
> >> #define SMPLOCK "lock;"
> >>
> >> static inline int opal_atomic_cmpset_32( volatile int32_t *addr,
> >>                                       int32_t oldval, int32_t newval)
> >> {
> >>  unsigned char ret;
> >>  __asm__ __volatile__ (
> >>                      SMPLOCK "cmpxchgl %1,%2   \n\t"
> >>                              "sete     %0      \n\t"
> >>                      : "=qm" (ret)
> >>                      : "q"(newval), "m"(*addr), "a"(oldval)
> >>                      : "memory");
> >>
> >>  return (int)ret;
> >> }
> >>
> >> int main(int argc, char* argv[] )
> >> {
> >> int32_t value = 0, oldval = 0, delta = 1;
> >> int32_t* addr = &value;
> >>
> >> do {
> >>     oldval = *addr;
> >> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
> >> return (oldval + delta);
> >> }
> >>
> >>
> >>
> >> Thanks,
> >>   george.
> >>
> >>
> >> On Jun 8, 2010, at 14:42 , Jeff Squyres wrote:
> >>
> >>> Look at my output -- I did...
> >>>
> >>> On Jun 8, 2010, at 10:40 AM, George Bosilca wrote:
> >>>
> >>>> Still no good, the opal_atomic_cmpset_32 is not inlined. Try to add -O3 
> >>>> to your command line, this helped for gcc.
> >>>>
> >>>> Thanks,
> >>>>  george.
> >>>>
> >>>> On Jun 8, 2010, at 14:14 , Jeff Squyres wrote:
> >>>>
> >>>>> On Jun 8, 2010, at 9:53 AM, George Bosilca wrote:
> >>>>>
> >>>>>> As you can see there is no explicit call, the opal_atomic_cmpset_32 is 
> >>>>>> really inlined. I think the problem is that you didn't specify the -O3 
> >>>>>> flag on your command line.
> >>>>>
> >>>>> Ah, you wanted me to compile the OMPI code itself and send you the 
> >>>>> assembly.  That's not what you asked for.  :-)
> >>>>>
> >>>>> (I just took the code you sent in the mail, stuffed it into george.c, 
> >>>>> and compiled that with -s -- outside of the context of the Open MPI 
> >>>>> code tree)
> >>>>>
> >>>>> Here's the new output.  It still didn't inline, but you can see the 
> >>>>> code for the _cmpset function:
> >>>>>
> >>>>> -----
> >>>>> [7:13] svbu-mpi:~/tmp % cat george.c                                    
> >>>>>       
> >>>>> #include <stdint.h>
> >>>>>
> >>>>> #include "opal/sys/atomic.h"
> >>>>>
> >>>>> int foo(void) {
> >>>>> int32_t oldval, delta;
> >>>>> int32_t *addr = 0;
> >>>>> do {
> >>>>>   oldval = *addr;
> >>>>> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
> >>>>> return (oldval + delta);
> >>>>> }
> >>>>>
> >>>>> [7:13] svbu-mpi:~/tmp % pgcc -O3 -I /home/jsquyres/svn/ompi4 
> >>>>> -I/home/jsquyres/svn/ompi4/opal/include -c -s george.c
> >>>>> [7:13] svbu-mpi:~/tmp % cat george.s                                    
> >>>>>               .file   "george.c"
> >>>>>    .version        "01.01"
> >>>>> ## PGC 7.0 -opt 1
> >>>>> ## PGC 06/08/2010  05:10:04
> >>>>> ## pgcc george.c -c -S
> >>>>> ## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc
> >>>>> ## george.c -opt 1 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x 
> >>>>> 123 0x1000
> >>>>> ## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 
> >>>>> 1 -quad
> >>>>> ## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc 
> >>>>> /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include
> >>>>> ## -def unix -def __unix -def __unix__ -def linux -def __linux -def 
> >>>>> __linux__
> >>>>> ## -def __NO_MATH_INLINES -def __x86_64__ -def 
> >>>>> __LONG_MAX__=9223372036854775807L
> >>>>> ## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int 
> >>>>> -def __THROW=
> >>>>> ## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def 
> >>>>> __SSE2__
> >>>>> ## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) 
> >>>>> #cpu(x86_64)
> >>>>> ## -cmdline +pgcc george.c -c -S -x 123 4 -x 123 0x80000000 
> >>>>> -alwaysinline /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
> >>>>> ## -asm george.s
> >>>>> ## lineno: 3
> >>>>>    .text
> >>>>>    .align  16
> >>>>>    .globl  foo
> >>>>> foo:
> >>>>> ..Dcfb0:
> >>>>>    pushq   %rbp
> >>>>> ..Dcfi0:
> >>>>>    movq    %rsp, %rbp
> >>>>> ..Dcfi1:
> >>>>>    subq    $16, %rsp
> >>>>> ..EN1:
> >>>>> ## lineno: 5
> >>>>>    movq    $0, -8(%rbp)
> >>>>>    .p2align        4,,3
> >>>>> .LB157:
> >>>>> ## lineno: 6
> >>>>>    movq    -8(%rbp), %rdi
> >>>>>    movl    (%rdi), %esi
> >>>>>    movl    %esi, -12(%rbp)
> >>>>>    movl    -16(%rbp), %edx
> >>>>>    addl    %esi, %edx
> >>>>>    xorl    %eax, %eax
> >>>>>    call    opal_atomic_cmpset_32
> >>>>>    testl   %eax, %eax
> >>>>>    je      .LB157
> >>>>>    movl    -16(%rbp), %eax
> >>>>>    addl    -12(%rbp), %eax
> >>>>> ## lineno: 10
> >>>>>    leave
> >>>>>    ret
> >>>>>    .type   foo,@function
> >>>>>    .size   foo,.-foo
> >>>>> ..Dcfe0:
> >>>>> __fooEND:
> >>>>>    .section        .pgi_trace
> >>>>>    .align  8
> >>>>>    .quad   foo     ## address of routine
> >>>>>    .quad   __fooEND - foo  ## size of routine
> >>>>>    .2byte  0       ## flags for future use
> >>>>>    .2byte  3       ## length of following string
> >>>>> ## name:foo:
> >>>>>    .byte   0x66,0x6f,0x6f,0x00
> >>>>>    .data
> >>>>>    .globl  opal_atomic_cmpset_32
> >>>>>    .section        .debug_frame
> >>>>> ..Dcieb0:
> >>>>>    .4byte  ..Dciee0-..Dcieb0-4     ## CIE length
> >>>>>    .4byte  0xffffffff      ## CIE ID
> >>>>>    .byte   0x1     ## CIE version
> >>>>>    .byte   0x0     ## no augmentation
> >>>>>    .byte   0x1     ## ULEB128 1, code alignment factor
> >>>>>    .byte   0x78    ## SLEB128 -8, data alignment factor
> >>>>>    .byte   0x10    ## return address column
> >>>>>    .byte   0xc     ## DW_CFA_def_cfa (col 7)
> >>>>>    .byte   0x7     ## ULEB128 7
> >>>>>    .byte   0x8     ## ULEB128 8
> >>>>>    .byte   0x90    ## DW_CFA_offset (col 16)
> >>>>>    .byte   0x1     ## ULEB128 1
> >>>>>    .align  8
> >>>>> ..Dciee0:
> >>>>>    .4byte  ..Dfdee0-..Dfdeb0       ## FDE length
> >>>>> ..Dfdeb0:
> >>>>>    .4byte  ..Dcieb0        ## CIE pointer
> >>>>>    .quad   ..Dcfb0 ## initial location
> >>>>>    .quad   ..Dcfe0-..Dcfb0 ## address range
> >>>>>    .byte   0x4     ## DW_CFA_advance_loc4
> >>>>>    .4byte  ..Dcfi0-..Dcfb0
> >>>>>    .byte   0xe     ## DW_CFA_def_cfa_offset
> >>>>>    .byte   0x10    ## ULEB128 16
> >>>>>    .byte   0x86    ## DW_CFA_offset (col 6)
> >>>>>    .byte   0x2     ## ULEB128 2
> >>>>>    .byte   0x4     ## DW_CFA_advance_loc4
> >>>>>    .4byte  ..Dcfi1-..Dcfi0
> >>>>>    .byte   0xd     ## DW_CFA_def_cfa_register (col 6)
> >>>>>    .byte   0x6     ## ULEB128 6
> >>>>>    .align  8
> >>>>> ..Dfdee0:
> >>>>>    .ident  "PGC 7.0-7"
> >>>>> [7:13] svbu-mpi:~/tmp %
> >>>>> -----
> >>>>>
> >>>>> --
> >>>>> Jeff Squyres
> >>>>> jsquy...@cisco.com
> >>>>> For corporate legal information go to:
> >>>>> http://www.cisco.com/web/about/doing_business/legal/cri/
> >>>>>
> >>>>>
> >>>>> _______________________________________________
> >>>>> devel mailing list
> >>>>> de...@open-mpi.org
> >>>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> >>>>
> >>>>
> >>>> _______________________________________________
> >>>> devel mailing list
> >>>> de...@open-mpi.org
> >>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> >>>>
> >>>
> >>>
> >>> --
> >>> Jeff Squyres
> >>> jsquy...@cisco.com
> >>> For corporate legal information go to:
> >>> http://www.cisco.com/web/about/doing_business/legal/cri/
> >>>
> >>>
> >>> _______________________________________________
> >>> devel mailing list
> >>> de...@open-mpi.org
> >>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> >>
> >>
> >> _______________________________________________
> >> devel mailing list
> >> de...@open-mpi.org
> >> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> >
> >
> > --
> > Jeff Squyres
> > jsquy...@cisco.com
> > For corporate legal information go to:
> > http://www.cisco.com/web/about/doing_business/legal/cri/
> >
> >
> > _______________________________________________
> > devel mailing list
> > de...@open-mpi.org
> > http://www.open-mpi.org/mailman/listinfo.cgi/devel
> 
> 
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> 


-- 
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/


Reply via email to