Matthew Wilcox wrote:

The original code and your rewrite both access memory twice in the loop.
Why don't we do it with one memory reference per loop instead?

{
        CMPXCHG_BUGCHECK_DECL

        u32 *m = (u32 *)addr + (nr >> 5);
        u32 bit = 1 << (nr & 31);

        u32 old = *m;
        while (!(old & bit)) {
                u32 new = old | bit;
                u32 prev = cmpxchg_acq(m, old, new);
                CMPXCHG_BUGCHECK(m);
                if (prev == old)
                        return 1;
                old = prev;
        }

        return 0;
}

Looking at the disassembly of grab_block() in fs/ext2/balloc.c, I don't
see much difference.  The ld4.acq turns into a regular ld4 (because
'm' is no longer tagged as volatile), and is hoisted out of the loop.
Interestingly, gcc chooses to reorder the tests, and make the loop four
bundles long instead of three, but will 'goto repeat' in two bundles
instead of four.  Using likely()/unlikely() doesn't persuade gcc to
change the order of the two branches, so I assume it actually is better
to do it this way.

I like this code with the following slight modifications:
- let's keep "m" as pointer to volatile
- let's keep on using "__u32" types
- not sure we need "new"
- return the old bit

        volatile __u32 *m = (volatile __u32 *)addr + (nr >> 5);
        __u32 bit = 1 << (nr & 31);

        __u32 old = *m;
        while (!(old & bit)) {
                __u32 prev = cmpxchg_acq(m, old, old | bit);
                CMPXCHG_BUGCHECK(m);
                if (prev == old)
                        return 0;
                old = prev;
        }
        return 1;

It compiles to:

  0:         and r15=31,r32
  6:         mov r14=1
  c:         extr r32=r32,5,27;;
 10:         shladd r18=r32,2,r33;;
 16:         ld4.acq r16=[r18]
 1c:         shl r17=r14,r15;;
 20:         and r14=r17,r16;;
 26:         nop.m 0x0
 2c:         cmp4.eq p7,p6=0,r14
 30:         nop.b 0x0
 36:         nop.b 0x0
 3c:   (p06) br.cond.dpnt.few a0 <+0xa0>
 40:         nop.m 0x0
 46:         zxt4 r14=r16
 4c:         nop.b 0x0;;
 50:         mov.m ar.ccv=r14;;
 56:         nop.m 0x0
 5c:         or r14=r17,r16
 60:         nop.m 0x0;;
 66:         cmpxchg4.acq r14=[r18],r14,ar.ccv
 6c:         nop.i 0x0;;
 70:         cmp4.eq p7,p6=r14,r16
 76:         nop.f 0x0
 7c:         and r15=r17,r14
 80:         mov r16=r14
 86:         nop.f 0x0
 8c:   (p07) br.cond.dpnt.few b0 <+0xb0>;;
 90:         nop.m 0x0
 96:         cmp4.eq p7,p6=0,r15
 9c:   (p07) br.cond.dptk.few 40 <+0x40>
 a0:         nop.m 0x0
 a6:         mov r8=1
 ac:         br.ret.sptk.many b0;;
 b0:         nop.m 0x0
 b6:         mov r8=r0
 bc:         br.ret.sptk.many b0;;

It seems to be o.k., thanks.

Zoltán Menyhárt


-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to