Oops -- I see the problem: I used "-s" instead of "-S", so the george.s I sent
you earlier was stale and meaningless; it was left over from some other test
(note that its header says "-opt 1" and "-c -S", not the command I ran). Sorry! :-(
But your test is much smaller and easier to check, so let's do that:
-----
[7:49] svbu-mpi:~/tmp % cat george2.c
#include <stdint.h>
#define SMPLOCK "lock;"
static inline int opal_atomic_cmpset_32( volatile int32_t *addr,
int32_t oldval, int32_t newval)
{
unsigned char ret;
__asm__ __volatile__ (
SMPLOCK "cmpxchgl %1,%2 \n\t"
"sete %0 \n\t"
: "=qm" (ret)
: "q"(newval), "m"(*addr), "a"(oldval)
: "memory");
return (int)ret;
}
int main(int argc, char* argv[] )
{
int32_t value = 0, oldval = 0, delta = 1;
int32_t* addr = &value;
do {
oldval = *addr;
} while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
return (oldval + delta);
}
[7:49] svbu-mpi:~/tmp % pgcc -O3 -c -S george2.c
[7:49] svbu-mpi:~/tmp % cat george2.s
.file "george2.c"
.version "01.01"
## PGC 7.0 -opt 3
## PGC 06/08/2010 07:49:33
## pgcc george2.c -O3 -c -S
## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc
## george2.c -opt 3 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x 123 0x1000
## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1 -quad
## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include
## -def unix -def __unix -def __unix__ -def linux -def __linux -def __linux__
## -def __NO_MATH_INLINES -def __x86_64__ -def __LONG_MAX__=9223372036854775807L
## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int -def __THROW=
## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def __SSE2__
## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) #cpu(x86_64)
## -cmdline +pgcc george2.c -O3 -c -S -x 123 4 -x 123 0x80000000 -alwaysinline /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
## -asm george2.s
## lineno: 7
.text
.align 16
opal_atomic_cmpset_32:
..Dcfb0:
pushq %rbp
..Dcfi0:
movq %rsp, %rbp
..Dcfi1:
movl %esi, -4(%rbp)
movl %edx, -8(%rbp)
..EN1:
## lineno: 16
movl -8(%rbp), %edx
movl -4(%rbp), %eax
lock;cmpxchgl %edx,(%rdi)
sete %cl
movb %cl, -9(%rbp)
## lineno: 17
movzbl -9(%rbp), %eax
## lineno: 0
popq %rbp
ret
.type opal_atomic_cmpset_32,@function
.size opal_atomic_cmpset_32,.-opal_atomic_cmpset_32
..Dcfe0:
__opal_atomic_cmpset_32END:
.section .pgi_trace
.align 8
.quad opal_atomic_cmpset_32 ## address of routine
.quad __opal_atomic_cmpset_32END - opal_atomic_cmpset_32 ## size of routine
.2byte 0 ## flags for future use
.2byte 21 ## length of following string
## name:opal_atomic_cmpset_32:
.byte 0x6f,0x70,0x61,0x6c,0x5f,0x61,0x74,0x6f,0x6d,0x69,0x63
.byte 0x5f,0x63,0x6d,0x70,0x73,0x65,0x74,0x5f,0x33,0x32,0x00
## lineno: 20
.text
.align 16
.globl main
main:
..Dcfb1:
pushq %rbp
..Dcfi2:
movq %rsp, %rbp
..Dcfi3:
subq $16, %rsp
movq %rbx, -16(%rbp)
pushq %rax
pushq %rax
stmxcsr (%rsp)
popq %rax
orq $64, %rax
pushq %rax
ldmxcsr (%rsp)
popq %rax
popq %rax
## lineno: 0
..EN2:
## lineno: 21
movl $0, -4(%rbp)
.align 8
.LB191:
## lineno: 24
movl -4(%rbp), %ebx
leaq -4(%rbp), %rdi
leal 1(%rbx), %edx
movl %ebx, %esi
call opal_atomic_cmpset_32
testl %eax, %eax
je .LB191
## lineno: 27
addl $1, %ebx
movl %ebx, %eax
## lineno: 28
movq -16(%rbp), %rbx
leave
ret
.type main,@function
.size main,.-main
..Dcfe1:
__mainEND:
.section .pgi_trace
.align 8
.quad main ## address of routine
.quad __mainEND - main ## size of routine
.2byte 0 ## flags for future use
.2byte 4 ## length of following string
## name:main:
.byte 0x6d,0x61,0x69,0x6e,0x00
.data
.section .debug_frame
..Dcieb0:
.4byte ..Dciee0-..Dcieb0-4 ## CIE length
.4byte 0xffffffff ## CIE ID
.byte 0x1 ## CIE version
.byte 0x0 ## no augmentation
.byte 0x1 ## ULEB128 1, code alignment factor
.byte 0x78 ## SLEB128 -8, data alignment factor
.byte 0x10 ## return address column
.byte 0xc ## DW_CFA_def_cfa (col 7)
.byte 0x7 ## ULEB128 7
.byte 0x8 ## ULEB128 8
.byte 0x90 ## DW_CFA_offset (col 16)
.byte 0x1 ## ULEB128 1
.align 8
..Dciee0:
.4byte ..Dfdee0-..Dfdeb0 ## FDE length
..Dfdeb0:
.4byte ..Dcieb0 ## CIE pointer
.quad ..Dcfb0 ## initial location
.quad ..Dcfe0-..Dcfb0 ## address range
.byte 0x4 ## DW_CFA_advance_loc4
.4byte ..Dcfi0-..Dcfb0
.byte 0xe ## DW_CFA_def_cfa_offset
.byte 0x10 ## ULEB128 16
.byte 0x86 ## DW_CFA_offset (col 6)
.byte 0x2 ## ULEB128 2
.byte 0x4 ## DW_CFA_advance_loc4
.4byte ..Dcfi1-..Dcfi0
.byte 0xd ## DW_CFA_def_cfa_register (col 6)
.byte 0x6 ## ULEB128 6
.align 8
..Dfdee0:
.4byte ..Dfdee1-..Dfdeb1 ## FDE length
..Dfdeb1:
.4byte ..Dcieb0 ## CIE pointer
.quad ..Dcfb1 ## initial location
.quad ..Dcfe1-..Dcfb1 ## address range
.byte 0x4 ## DW_CFA_advance_loc4
.4byte ..Dcfi2-..Dcfb1
.byte 0xe ## DW_CFA_def_cfa_offset
.byte 0x10 ## ULEB128 16
.byte 0x86 ## DW_CFA_offset (col 6)
.byte 0x2 ## ULEB128 2
.byte 0x4 ## DW_CFA_advance_loc4
.4byte ..Dcfi3-..Dcfi2
.byte 0xd ## DW_CFA_def_cfa_register (col 6)
.byte 0x6 ## ULEB128 6
.align 8
..Dfdee1:
.ident "PGC 7.0-7"
[7:49] svbu-mpi:~/tmp %
-----
On Jun 8, 2010, at 10:46 AM, George Bosilca wrote:
> It didn't work. Let's try with this small complete application:
>
> #include <stdint.h>
>
> #define SMPLOCK "lock;"
>
> static inline int opal_atomic_cmpset_32( volatile int32_t *addr,
> int32_t oldval, int32_t newval)
> {
> unsigned char ret;
> __asm__ __volatile__ (
> SMPLOCK "cmpxchgl %1,%2 \n\t"
> "sete %0 \n\t"
> : "=qm" (ret)
> : "q"(newval), "m"(*addr), "a"(oldval)
> : "memory");
>
> return (int)ret;
> }
>
> int main(int argc, char* argv[] )
> {
> int32_t value = 0, oldval = 0, delta = 1;
> int32_t* addr = &value;
>
> do {
> oldval = *addr;
> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
> return (oldval + delta);
> }
>
>
>
> Thanks,
> george.
>
>
> On Jun 8, 2010, at 14:42 , Jeff Squyres wrote:
>
>> Look at my output -- I did...
>>
>> On Jun 8, 2010, at 10:40 AM, George Bosilca wrote:
>>
>>> Still no good, the opal_atomic_cmpset_32 is not inlined. Try to add -O3 to
>>> your command line, this helped for gcc.
>>>
>>> Thanks,
>>> george.
>>>
>>> On Jun 8, 2010, at 14:14 , Jeff Squyres wrote:
>>>
>>>> On Jun 8, 2010, at 9:53 AM, George Bosilca wrote:
>>>>
>>>>> As you can see there is no explicit call, the opal_atomic_cmpset_32 is
>>>>> really inlined. I think the problem is that you didn't specify the -O3
>>>>> flag on your command line.
>>>>
>>>> Ah, you wanted me to compile the OMPI code itself and send you the
>>>> assembly. That's not what you asked for. :-)
>>>>
>>>> (I just took the code you sent in the mail, stuffed it into george.c, and
>>>> compiled that with -s -- outside of the context of the Open MPI code tree)
>>>>
>>>> Here's the new output. It still didn't inline, but you can see the code
>>>> for the _cmpset function:
>>>>
>>>> -----
>>>> [7:13] svbu-mpi:~/tmp % cat george.c
>>>>
>>>> #include <stdint.h>
>>>>
>>>> #include "opal/sys/atomic.h"
>>>>
>>>> int foo(void) {
>>>> int32_t oldval, delta;
>>>> int32_t *addr = 0;
>>>> do {
>>>> oldval = *addr;
>>>> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
>>>> return (oldval + delta);
>>>> }
>>>>
>>>> [7:13] svbu-mpi:~/tmp % pgcc -O3 -I /home/jsquyres/svn/ompi4
>>>> -I/home/jsquyres/svn/ompi4/opal/include -c -s george.c
>>>> [7:13] svbu-mpi:~/tmp % cat george.s
>>>> .file "george.c"
>>>> .version "01.01"
>>>> ## PGC 7.0 -opt 1
>>>> ## PGC 06/08/2010 05:10:04
>>>> ## pgcc george.c -c -S
>>>> ## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc
>>>> ## george.c -opt 1 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x
>>>> 123 0x1000
>>>> ## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1
>>>> -quad
>>>> ## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc
>>>> /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include
>>>> ## -def unix -def __unix -def __unix__ -def linux -def __linux -def
>>>> __linux__
>>>> ## -def __NO_MATH_INLINES -def __x86_64__ -def
>>>> __LONG_MAX__=9223372036854775807L
>>>> ## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int
>>>> -def __THROW=
>>>> ## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def
>>>> __SSE2__
>>>> ## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix)
>>>> #cpu(x86_64)
>>>> ## -cmdline +pgcc george.c -c -S -x 123 4 -x 123 0x80000000 -alwaysinline
>>>> /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
>>>> ## -asm george.s
>>>> ## lineno: 3
>>>> .text
>>>> .align 16
>>>> .globl foo
>>>> foo:
>>>> ..Dcfb0:
>>>> pushq %rbp
>>>> ..Dcfi0:
>>>> movq %rsp, %rbp
>>>> ..Dcfi1:
>>>> subq $16, %rsp
>>>> ..EN1:
>>>> ## lineno: 5
>>>> movq $0, -8(%rbp)
>>>> .p2align 4,,3
>>>> .LB157:
>>>> ## lineno: 6
>>>> movq -8(%rbp), %rdi
>>>> movl (%rdi), %esi
>>>> movl %esi, -12(%rbp)
>>>> movl -16(%rbp), %edx
>>>> addl %esi, %edx
>>>> xorl %eax, %eax
>>>> call opal_atomic_cmpset_32
>>>> testl %eax, %eax
>>>> je .LB157
>>>> movl -16(%rbp), %eax
>>>> addl -12(%rbp), %eax
>>>> ## lineno: 10
>>>> leave
>>>> ret
>>>> .type foo,@function
>>>> .size foo,.-foo
>>>> ..Dcfe0:
>>>> __fooEND:
>>>> .section .pgi_trace
>>>> .align 8
>>>> .quad foo ## address of routine
>>>> .quad __fooEND - foo ## size of routine
>>>> .2byte 0 ## flags for future use
>>>> .2byte 3 ## length of following string
>>>> ## name:foo:
>>>> .byte 0x66,0x6f,0x6f,0x00
>>>> .data
>>>> .globl opal_atomic_cmpset_32
>>>> .section .debug_frame
>>>> ..Dcieb0:
>>>> .4byte ..Dciee0-..Dcieb0-4 ## CIE length
>>>> .4byte 0xffffffff ## CIE ID
>>>> .byte 0x1 ## CIE version
>>>> .byte 0x0 ## no augmentation
>>>> .byte 0x1 ## ULEB128 1, code alignment factor
>>>> .byte 0x78 ## SLEB128 -8, data alignment factor
>>>> .byte 0x10 ## return address column
>>>> .byte 0xc ## DW_CFA_def_cfa (col 7)
>>>> .byte 0x7 ## ULEB128 7
>>>> .byte 0x8 ## ULEB128 8
>>>> .byte 0x90 ## DW_CFA_offset (col 16)
>>>> .byte 0x1 ## ULEB128 1
>>>> .align 8
>>>> ..Dciee0:
>>>> .4byte ..Dfdee0-..Dfdeb0 ## FDE length
>>>> ..Dfdeb0:
>>>> .4byte ..Dcieb0 ## CIE pointer
>>>> .quad ..Dcfb0 ## initial location
>>>> .quad ..Dcfe0-..Dcfb0 ## address range
>>>> .byte 0x4 ## DW_CFA_advance_loc4
>>>> .4byte ..Dcfi0-..Dcfb0
>>>> .byte 0xe ## DW_CFA_def_cfa_offset
>>>> .byte 0x10 ## ULEB128 16
>>>> .byte 0x86 ## DW_CFA_offset (col 6)
>>>> .byte 0x2 ## ULEB128 2
>>>> .byte 0x4 ## DW_CFA_advance_loc4
>>>> .4byte ..Dcfi1-..Dcfi0
>>>> .byte 0xd ## DW_CFA_def_cfa_register (col 6)
>>>> .byte 0x6 ## ULEB128 6
>>>> .align 8
>>>> ..Dfdee0:
>>>> .ident "PGC 7.0-7"
>>>> [7:13] svbu-mpi:~/tmp %
>>>> -----
>>>>
>>>> --
>>>> Jeff Squyres
>>>> [email protected]
>>>> For corporate legal information go to:
>>>> http://www.cisco.com/web/about/doing_business/legal/cri/
>>>>
>>>>
>>>> _______________________________________________
>>>> devel mailing list
>>>> [email protected]
>>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>>
>>>
>>> _______________________________________________
>>> devel mailing list
>>> [email protected]
>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>>
>>
>>
>> --
>> Jeff Squyres
>> [email protected]
>> For corporate legal information go to:
>> http://www.cisco.com/web/about/doing_business/legal/cri/
>>
>>
>> _______________________________________________
>> devel mailing list
>> [email protected]
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>
>
> _______________________________________________
> devel mailing list
> [email protected]
> http://www.open-mpi.org/mailman/listinfo.cgi/devel
--
Jeff Squyres
[email protected]
For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/