On Fri, Mar 24, 2017 at 09:54:46AM -0700, Andy Lutomirski wrote:
> > So the first snipped I tested regressed like so:
> >
> >
> > 0000000000000000 <T_refcount_inc>:                              
> > 0000000000000000 <T_refcount_inc>:
> >    0:   8b 07                   mov    (%rdi),%eax                 0:   8b 
> > 17                   mov    (%rdi),%edx
> >    2:   83 f8 ff                cmp    $0xffffffff,%eax            2:   83 
> > fa ff                cmp    $0xffffffff,%edx
> >    5:   74 13                   je     1a <T_refcount_inc+0x1a>    5:   74 
> > 1a                   je     21 <T_refcount_inc+0x21>
> >    7:   85 c0                   test   %eax,%eax                   7:   85 
> > d2                   test   %edx,%edx
> >    9:   74 0d                   je     18 <T_refcount_inc+0x18>    9:   74 
> > 13                   je     1e <T_refcount_inc+0x1e>
> >    b:   8d 50 01                lea    0x1(%rax),%edx              b:   8d 
> > 4a 01                lea    0x1(%rdx),%ecx
> >    e:   f0 0f b1 17             lock cmpxchg %edx,(%rdi)           e:   89 
> > d0                   mov    %edx,%eax
> >   12:   75 ee                   jne    2 <T_refcount_inc+0x2>     10:   f0 
> > 0f b1 0f             lock cmpxchg %ecx,(%rdi)
> >   14:   ff c2                   inc    %edx                       14:   74 
> > 04                   je     1a <T_refcount_inc+0x1a>
> >   16:   75 02                   jne    1a <T_refcount_inc+0x1a>   16:   89 
> > c2                   mov    %eax,%edx
> >   18:   0f 0b                   ud2                               18:   eb 
> > e8                   jmp    2 <T_refcount_inc+0x2>
> >   1a:   c3                      retq                              1a:   ff 
> > c1                   inc    %ecx
> >                                                                   1c:   75 
> > 03                   jne    21 <T_refcount_inc+0x21>
> >                                                                   1e:   0f 
> > 0b                   ud2
> >                                                                   20:   c3  
> >                     retq
> >                                                                   21:   c3  
> >                     retq
> 
> Can you re-send the better asm you got earlier?

On the left?

> If I pretend to be a dumb compiler, I wonder if you'd get better results with:
> 
> if (!success) {
>   *_old = __old;
>   return false;
> } else {
>   return true;
> }
> 
> or however you jam that into a statement expression.  That way you
> aren't relying on the compiler to merge the branches.

I tried a few variants, but nothing really made it better.

Find the tiny.c file below; I'm using:

  gcc (Debian 6.3.0-5) 6.3.0 20170124

it has both an inline and an stmt-expr try_cmpxchg variant to play with;
the 'expected' output is at the bottom (same as above left).

Note that clang doesn't compile this stuff due to missing features.

---

/* gcc -Os -std=gnu99 -fno-strict-overflow -falign-jumps=1 -falign-loops=1 -c 
tiny.c; objdump -dr tiny.o */

typedef _Bool bool;

static inline bool try_cmpxchg(unsigned int *ptr, unsigned int *val, unsigned 
int new)
{
        unsigned int old = *val;
        bool success;

        asm volatile("lock cmpxchgl %[new], %[ptr]"
                     : "=@ccz" (success),
                       [ptr] "+m" (*ptr),
                       [old] "+a" (old)
                     : [new] "r" (new)
                     : "memory");

//      if (!success)
                *val = old;

        return success;
}

#define __try_cmpxchg(_ptr, _pold, _new)                                \
({                                                                      \
        bool success;                                                   \
        __typeof__(_ptr) _old = (_pold);                                \
        __typeof__(*(_ptr)) __old = *_old;                              \
        __typeof__(*(_ptr)) __new = (_new);                             \
        volatile unsigned int * __ptr = (volatile unsigned int *)(_ptr);\
                                                                        \
        asm volatile("lock cmpxchgl %[new], %[ptr]"                     \
                     : "=@ccz" (success),                               \
                       [ptr] "+m" (*__ptr),                             \
                       [old] "+a" (__old)                               \
                     : [new] "r" (__new)                                \
                     : "memory");                                       \
        /* if (!success) */                                                     
\
                *_old = __old;                                          \
        success;                                                        \
})

#define EXCEPTION_VALUE(val, handler)   asm volatile ("ud2 # %0" : : "r" (val))

#define UINT_MAX        (~0U)

#define likely(x)    __builtin_expect(!!(x), 1)
#define unlikely(x)    __builtin_expect(!!(x), 0)

static inline void refcount_inc(unsigned int *r)
{
        unsigned int new, val = *(unsigned int volatile *)r;

        do {
                if (unlikely(val == UINT_MAX)) /* saturated */
                        return;

                if (unlikely(!val)) /* use-after-free */
                        goto exception;

                /* cannot overflow because we already checked UINT_MAX */
                new = val + 1;

        } while (unlikely(!try_cmpxchg(r, &val, new)));

        if (unlikely(new == UINT_MAX))
exception:      EXCEPTION_VALUE(val, __refcount_warn);
}

void T_refcount_inc(unsigned int *r)
{
        refcount_inc(r);
}

/*
   0:   8b 07                   mov    (%rdi),%eax
   2:   83 f8 ff                cmp    $0xffffffff,%eax
   5:   74 13                   je     1a <T_refcount_inc+0x1a>
   7:   85 c0                   test   %eax,%eax
   9:   74 0d                   je     18 <T_refcount_inc+0x18>
   b:   8d 50 01                lea    0x1(%rax),%edx
   e:   f0 0f b1 17             lock cmpxchg %edx,(%rdi)
  12:   75 ee                   jne    2 <T_refcount_inc+0x2>
  14:   ff c2                   inc    %edx
  16:   75 02                   jne    1a <T_refcount_inc+0x1a>
  18:   0f 0b                   ud2
  1a:   c3                      retq
*/

Reply via email to