-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Hi Dave and All,
I will not be able to test this patch until later today or tomorrow, as the LOM
port on the netra 1405 I use to test with is connected to a recently retired
server... I'll try to get it moved today, otherwise I'll do it first thing
tomorrow morning.
Dave, thanks for continuing to look into this. As always, anything I can do to
help I'm glad to offer.
Thanks,
Josh
David S. Miller wrote:
> [ CC:'ing Josh Grebe, he could reproduce these SMP hangs on
> his box quite reliably ]
>
> Ok folks, give this patch a try. It applies cleanly to
> 2.6.12 and 2.6.12.1
>
> [SPARC64]: Avoid membar instructions in delay slots.
>
> In particular, avoid membar instructions in the delay
> slot of a jmpl instruction.
>
> UltraSPARC-I, II, IIi, and IIe have a bug, documented in
> the UltraSPARC-IIi User's Manual, Appendix K, Erratum 51.
>
> The long and short of it is that if the IMU unit misses
> on a branch or jmpl, and there is a store buffer synchronizing
> membar in the delay slot, the chip can stop fetching instructions.
>
> If interrupts are enabled or some other trap occurs, the
> chip will unwedge itself, but performance will suffer.
>
> We already had a workaround for this bug in a few spots, but
> it's better to have the entire tree sanitized for this rule.
>
> Signed-off-by: David S. Miller <[EMAIL PROTECTED]>
>
> diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S
> --- a/arch/sparc64/kernel/entry.S
> +++ b/arch/sparc64/kernel/entry.S
> @@ -271,8 +271,9 @@ cplus_fptrap_insn_1:
> fmuld %f0, %f2, %f26
> faddd %f0, %f2, %f28
> fmuld %f0, %f2, %f30
> + membar #Sync
> b,pt %xcc, fpdis_exit
> - membar #Sync
> + nop
> 2: andcc %g5, FPRS_DU, %g0
> bne,pt %icc, 3f
> fzero %f32
> @@ -301,8 +302,9 @@ cplus_fptrap_insn_2:
> fmuld %f32, %f34, %f58
> faddd %f32, %f34, %f60
> fmuld %f32, %f34, %f62
> + membar #Sync
> ba,pt %xcc, fpdis_exit
> - membar #Sync
> + nop
> 3: mov SECONDARY_CONTEXT, %g3
> add %g6, TI_FPREGS, %g1
> ldxa [%g3] ASI_DMMU, %g5
> diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c
> --- a/arch/sparc64/kernel/semaphore.c
> +++ b/arch/sparc64/kernel/semaphore.c
> @@ -32,8 +32,9 @@ static __inline__ int __sem_update_count
> " add %1, %4, %1\n"
> " cas [%3], %0, %1\n"
> " cmp %0, %1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%icc, 1b\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count)
> : "r" (&sem->count), "r" (incr), "m" (sem->count)
> : "cc");
> @@ -71,8 +72,9 @@ void up(struct semaphore *sem)
> " cmp %%g1, %%g7\n"
> " bne,pn %%icc, 1b\n"
> " addcc %%g7, 1, %%g0\n"
> +" membar #StoreLoad | #StoreStore\n"
> " ble,pn %%icc, 3f\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> "2:\n"
> " .subsection 2\n"
> "3: mov %0, %%g1\n"
> @@ -128,8 +130,9 @@ void __sched down(struct semaphore *sem)
> " cmp %%g1, %%g7\n"
> " bne,pn %%icc, 1b\n"
> " cmp %%g7, 1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bl,pn %%icc, 3f\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> "2:\n"
> " .subsection 2\n"
> "3: mov %0, %%g1\n"
> @@ -233,8 +236,9 @@ int __sched down_interruptible(struct se
> " cmp %%g1, %%g7\n"
> " bne,pn %%icc, 1b\n"
> " cmp %%g7, 1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bl,pn %%icc, 3f\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> "2:\n"
> " .subsection 2\n"
> "3: mov %2, %%g1\n"
> diff --git a/arch/sparc64/kernel/trampoline.S
> b/arch/sparc64/kernel/trampoline.S
> --- a/arch/sparc64/kernel/trampoline.S
> +++ b/arch/sparc64/kernel/trampoline.S
> @@ -98,8 +98,9 @@ startup_continue:
>
> sethi %hi(prom_entry_lock), %g2
> 1: ldstub [%g2 + %lo(prom_entry_lock)], %g1
> + membar #StoreLoad | #StoreStore
> brnz,pn %g1, 1b
> - membar #StoreLoad | #StoreStore
> + nop
>
> sethi %hi(p1275buf), %g2
> or %g2, %lo(p1275buf), %g2
> diff --git a/arch/sparc64/lib/U1memcpy.S b/arch/sparc64/lib/U1memcpy.S
> --- a/arch/sparc64/lib/U1memcpy.S
> +++ b/arch/sparc64/lib/U1memcpy.S
> @@ -87,14 +87,17 @@
> #define LOOP_CHUNK3(src, dest, len, branch_dest) \
> MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
>
> +#define DO_SYNC membar #Sync;
> #define STORE_SYNC(dest, fsrc) \
> EX_ST(STORE_BLK(%fsrc, %dest)); \
> - add %dest, 0x40, %dest;
> + add %dest, 0x40, %dest; \
> + DO_SYNC
>
> #define STORE_JUMP(dest, fsrc, target) \
> EX_ST(STORE_BLK(%fsrc, %dest)); \
> add %dest, 0x40, %dest; \
> - ba,pt %xcc, target;
> + ba,pt %xcc, target; \
> + nop;
>
> #define FINISH_VISCHUNK(dest, f0, f1, left) \
> subcc %left, 8, %left;\
> @@ -239,17 +242,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f0, %f2, %f48
> 1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
> - STORE_JUMP(o0, f48, 40f) membar #Sync
> + STORE_JUMP(o0, f48, 40f)
> 2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
> - STORE_JUMP(o0, f48, 48f) membar #Sync
> + STORE_JUMP(o0, f48, 48f)
> 3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
> - STORE_JUMP(o0, f48, 56f) membar #Sync
> + STORE_JUMP(o0, f48, 56f)
>
> 1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -260,17 +263,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f2, %f4, %f48
> 1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
> - STORE_JUMP(o0, f48, 41f) membar #Sync
> + STORE_JUMP(o0, f48, 41f)
> 2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
> - STORE_JUMP(o0, f48, 49f) membar #Sync
> + STORE_JUMP(o0, f48, 49f)
> 3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
> - STORE_JUMP(o0, f48, 57f) membar #Sync
> + STORE_JUMP(o0, f48, 57f)
>
> 1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -281,17 +284,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f4, %f6, %f48
> 1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
> - STORE_JUMP(o0, f48, 42f) membar #Sync
> + STORE_JUMP(o0, f48, 42f)
> 2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
> - STORE_JUMP(o0, f48, 50f) membar #Sync
> + STORE_JUMP(o0, f48, 50f)
> 3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
> - STORE_JUMP(o0, f48, 58f) membar #Sync
> + STORE_JUMP(o0, f48, 58f)
>
> 1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -302,17 +305,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f6, %f8, %f48
> 1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
> - STORE_JUMP(o0, f48, 43f) membar #Sync
> + STORE_JUMP(o0, f48, 43f)
> 2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
> - STORE_JUMP(o0, f48, 51f) membar #Sync
> + STORE_JUMP(o0, f48, 51f)
> 3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
> - STORE_JUMP(o0, f48, 59f) membar #Sync
> + STORE_JUMP(o0, f48, 59f)
>
> 1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -323,17 +326,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f8, %f10, %f48
> 1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
> - STORE_JUMP(o0, f48, 44f) membar #Sync
> + STORE_JUMP(o0, f48, 44f)
> 2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
> - STORE_JUMP(o0, f48, 52f) membar #Sync
> + STORE_JUMP(o0, f48, 52f)
> 3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
> - STORE_JUMP(o0, f48, 60f) membar #Sync
> + STORE_JUMP(o0, f48, 60f)
>
> 1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -344,17 +347,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f10, %f12, %f48
> 1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
> - STORE_JUMP(o0, f48, 45f) membar #Sync
> + STORE_JUMP(o0, f48, 45f)
> 2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
> - STORE_JUMP(o0, f48, 53f) membar #Sync
> + STORE_JUMP(o0, f48, 53f)
> 3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
> - STORE_JUMP(o0, f48, 61f) membar #Sync
> + STORE_JUMP(o0, f48, 61f)
>
> 1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -365,17 +368,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f12, %f14, %f48
> 1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
> - STORE_JUMP(o0, f48, 46f) membar #Sync
> + STORE_JUMP(o0, f48, 46f)
> 2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
> - STORE_JUMP(o0, f48, 54f) membar #Sync
> + STORE_JUMP(o0, f48, 54f)
> 3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
> - STORE_JUMP(o0, f48, 62f) membar #Sync
> + STORE_JUMP(o0, f48, 62f)
>
> 1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -386,17 +389,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f14, %f16, %f48
> 1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
> - STORE_JUMP(o0, f48, 47f) membar #Sync
> + STORE_JUMP(o0, f48, 47f)
> 2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
> - STORE_JUMP(o0, f48, 55f) membar #Sync
> + STORE_JUMP(o0, f48, 55f)
> 3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
> - STORE_JUMP(o0, f48, 63f) membar #Sync
> + STORE_JUMP(o0, f48, 63f)
>
> 40: FINISH_VISCHUNK(o0, f0, f2, g3)
> 41: FINISH_VISCHUNK(o0, f2, f4, g3)
> diff --git a/arch/sparc64/lib/VISsave.S b/arch/sparc64/lib/VISsave.S
> --- a/arch/sparc64/lib/VISsave.S
> +++ b/arch/sparc64/lib/VISsave.S
> @@ -72,7 +72,11 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3
>
> stda %f48, [%g3 + %g1] ASI_BLK_P
> 5: membar #Sync
> - jmpl %g7 + %g0, %g0
> + ba,pt %xcc, 80f
> + nop
> +
> + .align 32
> +80: jmpl %g7 + %g0, %g0
> nop
>
> 6: ldub [%g3 + TI_FPSAVED], %o5
> @@ -87,8 +91,11 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3
> stda %f32, [%g2 + %g1] ASI_BLK_P
> stda %f48, [%g3 + %g1] ASI_BLK_P
> membar #Sync
> - jmpl %g7 + %g0, %g0
> + ba,pt %xcc, 80f
> + nop
>
> + .align 32
> +80: jmpl %g7 + %g0, %g0
> nop
>
> .align 32
> @@ -126,6 +133,10 @@ VISenterhalf:
> stda %f0, [%g2 + %g1] ASI_BLK_P
> stda %f16, [%g3 + %g1] ASI_BLK_P
> membar #Sync
> + ba,pt %xcc, 4f
> + nop
> +
> + .align 32
> 4: and %o5, FPRS_DU, %o5
> jmpl %g7 + %g0, %g0
> wr %o5, FPRS_FEF, %fprs
> diff --git a/arch/sparc64/lib/atomic.S b/arch/sparc64/lib/atomic.S
> --- a/arch/sparc64/lib/atomic.S
> +++ b/arch/sparc64/lib/atomic.S
> @@ -7,18 +7,6 @@
> #include <linux/config.h>
> #include <asm/asi.h>
>
> - /* On SMP we need to use memory barriers to ensure
> - * correct memory operation ordering, nop these out
> - * for uniprocessor.
> - */
> -#ifdef CONFIG_SMP
> -#define ATOMIC_PRE_BARRIER membar #StoreLoad | #LoadLoad
> -#define ATOMIC_POST_BARRIER membar #StoreLoad | #StoreStore
> -#else
> -#define ATOMIC_PRE_BARRIER nop
> -#define ATOMIC_POST_BARRIER nop
> -#endif
> -
> .text
>
> /* Two versions of the atomic routines, one that
> @@ -52,6 +40,24 @@ atomic_sub: /* %o0 = decrement, %o1 = at
> nop
> .size atomic_sub, .-atomic_sub
>
> + /* On SMP we need to use memory barriers to ensure
> + * correct memory operation ordering, nop these out
> + * for uniprocessor.
> + */
> +#ifdef CONFIG_SMP
> +
> +#define ATOMIC_PRE_BARRIER membar #StoreLoad | #LoadLoad;
> +#define ATOMIC_POST_BARRIER \
> + ba,pt %xcc, 80b; \
> + membar #StoreLoad | #StoreStore
> +
> +80: retl
> + nop
> +#else
> +#define ATOMIC_PRE_BARRIER
> +#define ATOMIC_POST_BARRIER
> +#endif
> +
> .globl atomic_add_ret
> .type atomic_add_ret,#function
> atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
> @@ -62,9 +68,10 @@ atomic_add_ret: /* %o0 = increment, %o1
> cmp %g1, %g7
> bne,pn %icc, 1b
> add %g7, %o0, %g7
> + sra %g7, 0, %o0
> ATOMIC_POST_BARRIER
> retl
> - sra %g7, 0, %o0
> + nop
> .size atomic_add_ret, .-atomic_add_ret
>
> .globl atomic_sub_ret
> @@ -77,9 +84,10 @@ atomic_sub_ret: /* %o0 = decrement, %o1
> cmp %g1, %g7
> bne,pn %icc, 1b
> sub %g7, %o0, %g7
> + sra %g7, 0, %o0
> ATOMIC_POST_BARRIER
> retl
> - sra %g7, 0, %o0
> + nop
> .size atomic_sub_ret, .-atomic_sub_ret
>
> .globl atomic64_add
> @@ -118,9 +126,10 @@ atomic64_add_ret: /* %o0 = increment, %o
> cmp %g1, %g7
> bne,pn %xcc, 1b
> add %g7, %o0, %g7
> + mov %g7, %o0
> ATOMIC_POST_BARRIER
> retl
> - mov %g7, %o0
> + nop
> .size atomic64_add_ret, .-atomic64_add_ret
>
> .globl atomic64_sub_ret
> @@ -133,7 +142,8 @@ atomic64_sub_ret: /* %o0 = decrement, %o
> cmp %g1, %g7
> bne,pn %xcc, 1b
> sub %g7, %o0, %g7
> + mov %g7, %o0
> ATOMIC_POST_BARRIER
> retl
> - mov %g7, %o0
> + nop
> .size atomic64_sub_ret, .-atomic64_sub_ret
> diff --git a/arch/sparc64/lib/bitops.S b/arch/sparc64/lib/bitops.S
> --- a/arch/sparc64/lib/bitops.S
> +++ b/arch/sparc64/lib/bitops.S
> @@ -7,20 +7,26 @@
> #include <linux/config.h>
> #include <asm/asi.h>
>
> + .text
> +
> /* On SMP we need to use memory barriers to ensure
> * correct memory operation ordering, nop these out
> * for uniprocessor.
> */
> +
> #ifdef CONFIG_SMP
> #define BITOP_PRE_BARRIER membar #StoreLoad | #LoadLoad
> -#define BITOP_POST_BARRIER membar #StoreLoad | #StoreStore
> +#define BITOP_POST_BARRIER \
> + ba,pt %xcc, 80b; \
> + membar #StoreLoad | #StoreStore
> +
> +80: retl
> + nop
> #else
> -#define BITOP_PRE_BARRIER nop
> -#define BITOP_POST_BARRIER nop
> +#define BITOP_PRE_BARRIER
> +#define BITOP_POST_BARRIER
> #endif
>
> - .text
> -
> .globl test_and_set_bit
> .type test_and_set_bit,#function
> test_and_set_bit: /* %o0=nr, %o1=addr */
> @@ -37,10 +43,11 @@ test_and_set_bit: /* %o0=nr, %o1=addr */
> cmp %g7, %g1
> bne,pn %xcc, 1b
> and %g7, %o2, %g2
> - BITOP_POST_BARRIER
> clr %o0
> + movrne %g2, 1, %o0
> + BITOP_POST_BARRIER
> retl
> - movrne %g2, 1, %o0
> + nop
> .size test_and_set_bit, .-test_and_set_bit
>
> .globl test_and_clear_bit
> @@ -59,10 +66,11 @@ test_and_clear_bit: /* %o0=nr, %o1=addr
> cmp %g7, %g1
> bne,pn %xcc, 1b
> and %g7, %o2, %g2
> - BITOP_POST_BARRIER
> clr %o0
> + movrne %g2, 1, %o0
> + BITOP_POST_BARRIER
> retl
> - movrne %g2, 1, %o0
> + nop
> .size test_and_clear_bit, .-test_and_clear_bit
>
> .globl test_and_change_bit
> @@ -81,10 +89,11 @@ test_and_change_bit: /* %o0=nr, %o1=addr
> cmp %g7, %g1
> bne,pn %xcc, 1b
> and %g7, %o2, %g2
> - BITOP_POST_BARRIER
> clr %o0
> + movrne %g2, 1, %o0
> + BITOP_POST_BARRIER
> retl
> - movrne %g2, 1, %o0
> + nop
> .size test_and_change_bit, .-test_and_change_bit
>
> .globl set_bit
> diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c
> --- a/arch/sparc64/lib/debuglocks.c
> +++ b/arch/sparc64/lib/debuglocks.c
> @@ -252,8 +252,9 @@ wlock_again:
> " andn %%g1, %%g3, %%g7\n"
> " casx [%0], %%g1, %%g7\n"
> " cmp %%g1, %%g7\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%xcc, 1b\n"
> -" membar #StoreLoad | #StoreStore"
> +" nop"
> : /* no outputs */
> : "r" (&(rw->lock))
> : "g3", "g1", "g7", "cc", "memory");
> @@ -351,8 +352,9 @@ int _do_write_trylock (rwlock_t *rw, cha
> " andn %%g1, %%g3, %%g7\n"
> " casx [%0], %%g1, %%g7\n"
> " cmp %%g1, %%g7\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%xcc, 1b\n"
> -" membar #StoreLoad | #StoreStore"
> +" nop"
> : /* no outputs */
> : "r" (&(rw->lock))
> : "g3", "g1", "g7", "cc", "memory");
> diff --git a/arch/sparc64/lib/dec_and_lock.S b/arch/sparc64/lib/dec_and_lock.S
> --- a/arch/sparc64/lib/dec_and_lock.S
> +++ b/arch/sparc64/lib/dec_and_lock.S
> @@ -48,8 +48,9 @@ start_to_zero:
> #endif
> to_zero:
> ldstub [%o1], %g3
> + membar #StoreLoad | #StoreStore
> brnz,pn %g3, spin_on_lock
> - membar #StoreLoad | #StoreStore
> + nop
> loop2: cas [%o0], %g2, %g7 /* ASSERT(g7 == 0) */
> cmp %g2, %g7
>
> @@ -71,8 +72,9 @@ loop2: cas [%o0], %g2, %g7 /* ASSERT(g7
> nop
> spin_on_lock:
> ldub [%o1], %g3
> + membar #LoadLoad
> brnz,pt %g3, spin_on_lock
> - membar #LoadLoad
> + nop
> ba,pt %xcc, to_zero
> nop
> nop
> diff --git a/arch/sparc64/lib/rwsem.S b/arch/sparc64/lib/rwsem.S
> --- a/arch/sparc64/lib/rwsem.S
> +++ b/arch/sparc64/lib/rwsem.S
> @@ -17,8 +17,9 @@ __down_read:
> bne,pn %icc, 1b
> add %g7, 1, %g7
> cmp %g7, 0
> + membar #StoreLoad | #StoreStore
> bl,pn %icc, 3f
> - membar #StoreLoad | #StoreStore
> + nop
> 2:
> retl
> nop
> @@ -57,8 +58,9 @@ __down_write:
> cmp %g3, %g7
> bne,pn %icc, 1b
> cmp %g7, 0
> + membar #StoreLoad | #StoreStore
> bne,pn %icc, 3f
> - membar #StoreLoad | #StoreStore
> + nop
> 2: retl
> nop
> 3:
> @@ -97,8 +99,9 @@ __up_read:
> cmp %g1, %g7
> bne,pn %icc, 1b
> cmp %g7, 0
> + membar #StoreLoad | #StoreStore
> bl,pn %icc, 3f
> - membar #StoreLoad | #StoreStore
> + nop
> 2: retl
> nop
> 3: sethi %hi(RWSEM_ACTIVE_MASK), %g1
> @@ -126,8 +129,9 @@ __up_write:
> bne,pn %icc, 1b
> sub %g7, %g1, %g7
> cmp %g7, 0
> + membar #StoreLoad | #StoreStore
> bl,pn %icc, 3f
> - membar #StoreLoad | #StoreStore
> + nop
> 2:
> retl
> nop
> @@ -151,8 +155,9 @@ __downgrade_write:
> bne,pn %icc, 1b
> sub %g7, %g1, %g7
> cmp %g7, 0
> + membar #StoreLoad | #StoreStore
> bl,pn %icc, 3f
> - membar #StoreLoad | #StoreStore
> + nop
> 2:
> retl
> nop
> diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
> --- a/arch/sparc64/mm/init.c
> +++ b/arch/sparc64/mm/init.c
> @@ -136,8 +136,9 @@ static __inline__ void set_dcache_dirty(
> "or %%g1, %0, %%g1\n\t"
> "casx [%2], %%g7, %%g1\n\t"
> "cmp %%g7, %%g1\n\t"
> + "membar #StoreLoad | #StoreStore\n\t"
> "bne,pn %%xcc, 1b\n\t"
> - " membar #StoreLoad | #StoreStore"
> + " nop"
> : /* no outputs */
> : "r" (mask), "r" (non_cpu_bits), "r"
> (&page->flags)
> : "g1", "g7");
> @@ -157,8 +158,9 @@ static __inline__ void clear_dcache_dirt
> " andn %%g7, %1, %%g1\n\t"
> "casx [%2], %%g7, %%g1\n\t"
> "cmp %%g7, %%g1\n\t"
> + "membar #StoreLoad | #StoreStore\n\t"
> "bne,pn %%xcc, 1b\n\t"
> - " membar #StoreLoad | #StoreStore\n"
> + " nop\n"
> "2:"
> : /* no outputs */
> : "r" (cpu), "r" (mask), "r" (&page->flags),
> diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S
> --- a/arch/sparc64/mm/ultra.S
> +++ b/arch/sparc64/mm/ultra.S
> @@ -266,8 +266,9 @@ __cheetah_flush_tlb_pending: /* 22 insns
> andn %o3, 1, %o3
> stxa %g0, [%o3] ASI_IMMU_DEMAP
> 2: stxa %g0, [%o3] ASI_DMMU_DEMAP
> + membar #Sync
> brnz,pt %o1, 1b
> - membar #Sync
> + nop
> stxa %g2, [%o4] ASI_DMMU
> flush %g6
> wrpr %g0, 0, %tl
> diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h
> --- a/include/asm-sparc64/rwsem.h
> +++ b/include/asm-sparc64/rwsem.h
> @@ -55,8 +55,9 @@ static __inline__ int rwsem_atomic_updat
> "add %%g1, %1, %%g7\n\t"
> "cas [%2], %%g1, %%g7\n\t"
> "cmp %%g1, %%g7\n\t"
> + "membar #StoreLoad | #StoreStore\n\t"
> "bne,pn %%icc, 1b\n\t"
> - " membar #StoreLoad | #StoreStore\n\t"
> + " nop\n\t"
> "mov %%g7, %0\n\t"
> : "=&r" (tmp)
> : "0" (tmp), "r" (sem)
> diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h
> --- a/include/asm-sparc64/spinlock.h
> +++ b/include/asm-sparc64/spinlock.h
> @@ -52,12 +52,14 @@ static inline void _raw_spin_lock(spinlo
>
> __asm__ __volatile__(
> "1: ldstub [%1], %0\n"
> +" membar #StoreLoad | #StoreStore\n"
> " brnz,pn %0, 2f\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> " .subsection 2\n"
> "2: ldub [%1], %0\n"
> +" membar #LoadLoad\n"
> " brnz,pt %0, 2b\n"
> -" membar #LoadLoad\n"
> +" nop\n"
> " ba,a,pt %%xcc, 1b\n"
> " .previous"
> : "=&r" (tmp)
> @@ -95,16 +97,18 @@ static inline void _raw_spin_lock_flags(
>
> __asm__ __volatile__(
> "1: ldstub [%2], %0\n"
> -" brnz,pn %0, 2f\n"
> " membar #StoreLoad | #StoreStore\n"
> +" brnz,pn %0, 2f\n"
> +" nop\n"
> " .subsection 2\n"
> "2: rdpr %%pil, %1\n"
> " wrpr %3, %%pil\n"
> "3: ldub [%2], %0\n"
> -" brnz,pt %0, 3b\n"
> " membar #LoadLoad\n"
> +" brnz,pt %0, 3b\n"
> +" nop\n"
> " ba,pt %%xcc, 1b\n"
> -" wrpr %1, %%pil\n"
> +" wrpr %1, %%pil\n"
> " .previous"
> : "=&r" (tmp1), "=&r" (tmp2)
> : "r"(lock), "r"(flags)
> @@ -162,12 +166,14 @@ static void inline __read_lock(rwlock_t
> "4: add %0, 1, %1\n"
> " cas [%2], %0, %1\n"
> " cmp %0, %1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%icc, 1b\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> " .subsection 2\n"
> "2: ldsw [%2], %0\n"
> +" membar #LoadLoad\n"
> " brlz,pt %0, 2b\n"
> -" membar #LoadLoad\n"
> +" nop\n"
> " ba,a,pt %%xcc, 4b\n"
> " .previous"
> : "=&r" (tmp1), "=&r" (tmp2)
> @@ -204,12 +210,14 @@ static void inline __write_lock(rwlock_t
> "4: or %0, %3, %1\n"
> " cas [%2], %0, %1\n"
> " cmp %0, %1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%icc, 1b\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> " .subsection 2\n"
> "2: lduw [%2], %0\n"
> +" membar #LoadLoad\n"
> " brnz,pt %0, 2b\n"
> -" membar #LoadLoad\n"
> +" nop\n"
> " ba,a,pt %%xcc, 4b\n"
> " .previous"
> : "=&r" (tmp1), "=&r" (tmp2)
> @@ -240,8 +248,9 @@ static int inline __write_trylock(rwlock
> " or %0, %4, %1\n"
> " cas [%3], %0, %1\n"
> " cmp %0, %1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%icc, 1b\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> " mov 1, %2\n"
> "2:"
> : "=&r" (tmp1), "=&r" (tmp2), "=&r" (result)
> diff --git a/include/asm-sparc64/spitfire.h b/include/asm-sparc64/spitfire.h
> --- a/include/asm-sparc64/spitfire.h
> +++ b/include/asm-sparc64/spitfire.h
> @@ -111,7 +111,6 @@ static __inline__ void spitfire_put_dcac
> "membar #Sync"
> : /* No outputs */
> : "r" (tag), "r" (addr), "i" (ASI_DCACHE_TAG));
> - __asm__ __volatile__ ("membar #Sync" : : : "memory");
> }
>
> /* The instruction cache lines are flushed with this, but note that
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.1 (GNU/Linux)
Comment: Using GnuPG with Thunderbird - http://enigmail.mozdev.org
iD8DBQFCwDueFAhB33r2ACYRAmcpAJ9ZJsS/tymHUBWuXgPzWUimK6Xv4gCdF0n3
cQBeEJPGePB1Uw0MwJ5sUpg=
=f/TR
-----END PGP SIGNATURE-----
--
[email protected] mailing list