-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Hi Dave and All,

I will not be able to test this patch until later today or tomorrow, as the LOM
port on the Netra 1405 I use for testing is connected to a recently retired
server. I'll try to get it moved today; otherwise I'll do it first thing
tomorrow morning.

Dave, thanks for continuing to look into this. As always, anything I can do to
help, I'm glad to offer.

Thanks,

Josh


David S. Miller wrote:
> [ CC:'ing Josh Grebe, he could reproduce these SMP hangs on
>   his box quite reliably ]
> 
> Ok folks, give this patch a try.  It applies cleanly to
> 2.6.12 and 2.6.12.1
> 
> [SPARC64]: Avoid membar instructions in delay slots.
> 
> In particular, avoid membar instructions in the delay
> slot of a jmpl instruction.
> 
> UltraSPARC-I, II, IIi, and IIe have a bug, documented in
> the UltraSPARC-IIi User's Manual, Appendix K, Erratum 51.
> 
> The long and short of it is that if the IMU unit misses
> on a branch or jmpl, and there is a store buffer synchronizing
> membar in the delay slot, the chip can stop fetching instructions.
> 
> If interrupts are enabled or some other trap is enabled, the
> chip will unwedge itself, but performance will suffer.
> 
> We already had a workaround for this bug in a few spots, but
> it's better to have the entire tree sanitized for this rule.
> 
> Signed-off-by: David S. Miller <[EMAIL PROTECTED]>
> 
> diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S
> --- a/arch/sparc64/kernel/entry.S
> +++ b/arch/sparc64/kernel/entry.S
> @@ -271,8 +271,9 @@ cplus_fptrap_insn_1:
>       fmuld           %f0, %f2, %f26
>       faddd           %f0, %f2, %f28
>       fmuld           %f0, %f2, %f30
> +     membar          #Sync
>       b,pt            %xcc, fpdis_exit
> -      membar         #Sync
> +      nop
>  2:   andcc           %g5, FPRS_DU, %g0
>       bne,pt          %icc, 3f
>        fzero          %f32
> @@ -301,8 +302,9 @@ cplus_fptrap_insn_2:
>       fmuld           %f32, %f34, %f58
>       faddd           %f32, %f34, %f60
>       fmuld           %f32, %f34, %f62
> +     membar          #Sync
>       ba,pt           %xcc, fpdis_exit
> -      membar         #Sync
> +      nop
>  3:   mov             SECONDARY_CONTEXT, %g3
>       add             %g6, TI_FPREGS, %g1
>       ldxa            [%g3] ASI_DMMU, %g5
> diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c
> --- a/arch/sparc64/kernel/semaphore.c
> +++ b/arch/sparc64/kernel/semaphore.c
> @@ -32,8 +32,9 @@ static __inline__ int __sem_update_count
>  "    add     %1, %4, %1\n"
>  "    cas     [%3], %0, %1\n"
>  "    cmp     %0, %1\n"
> +"    membar  #StoreLoad | #StoreStore\n"
>  "    bne,pn  %%icc, 1b\n"
> -"     membar #StoreLoad | #StoreStore\n"
> +"     nop\n"
>       : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count)
>       : "r" (&sem->count), "r" (incr), "m" (sem->count)
>       : "cc");
> @@ -71,8 +72,9 @@ void up(struct semaphore *sem)
>  "    cmp     %%g1, %%g7\n"
>  "    bne,pn  %%icc, 1b\n"
>  "     addcc  %%g7, 1, %%g0\n"
> +"    membar  #StoreLoad | #StoreStore\n"
>  "    ble,pn  %%icc, 3f\n"
> -"     membar #StoreLoad | #StoreStore\n"
> +"     nop\n"
>  "2:\n"
>  "    .subsection 2\n"
>  "3:  mov     %0, %%g1\n"
> @@ -128,8 +130,9 @@ void __sched down(struct semaphore *sem)
>  "    cmp     %%g1, %%g7\n"
>  "    bne,pn  %%icc, 1b\n"
>  "     cmp    %%g7, 1\n"
> +"    membar  #StoreLoad | #StoreStore\n"
>  "    bl,pn   %%icc, 3f\n"
> -"     membar #StoreLoad | #StoreStore\n"
> +"     nop\n"
>  "2:\n"
>  "    .subsection 2\n"
>  "3:  mov     %0, %%g1\n"
> @@ -233,8 +236,9 @@ int __sched down_interruptible(struct se
>  "    cmp     %%g1, %%g7\n"
>  "    bne,pn  %%icc, 1b\n"
>  "     cmp    %%g7, 1\n"
> +"    membar  #StoreLoad | #StoreStore\n"
>  "    bl,pn   %%icc, 3f\n"
> -"     membar #StoreLoad | #StoreStore\n"
> +"     nop\n"
>  "2:\n"
>  "    .subsection 2\n"
>  "3:  mov     %2, %%g1\n"
> diff --git a/arch/sparc64/kernel/trampoline.S b/arch/sparc64/kernel/trampoline.S
> --- a/arch/sparc64/kernel/trampoline.S
> +++ b/arch/sparc64/kernel/trampoline.S
> @@ -98,8 +98,9 @@ startup_continue:
>  
>       sethi           %hi(prom_entry_lock), %g2
>  1:   ldstub          [%g2 + %lo(prom_entry_lock)], %g1
> +     membar          #StoreLoad | #StoreStore
>       brnz,pn         %g1, 1b
> -      membar         #StoreLoad | #StoreStore
> +      nop
>  
>       sethi           %hi(p1275buf), %g2
>       or              %g2, %lo(p1275buf), %g2
> diff --git a/arch/sparc64/lib/U1memcpy.S b/arch/sparc64/lib/U1memcpy.S
> --- a/arch/sparc64/lib/U1memcpy.S
> +++ b/arch/sparc64/lib/U1memcpy.S
> @@ -87,14 +87,17 @@
>  #define LOOP_CHUNK3(src, dest, len, branch_dest)             \
>       MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
>  
> +#define DO_SYNC                      membar  #Sync;
>  #define STORE_SYNC(dest, fsrc)                               \
>       EX_ST(STORE_BLK(%fsrc, %dest));                 \
> -     add                     %dest, 0x40, %dest;
> +     add                     %dest, 0x40, %dest;     \
> +     DO_SYNC
>  
>  #define STORE_JUMP(dest, fsrc, target)                       \
>       EX_ST(STORE_BLK(%fsrc, %dest));                 \
>       add                     %dest, 0x40, %dest;     \
> -     ba,pt                   %xcc, target;
> +     ba,pt                   %xcc, target;           \
> +      nop;
>  
>  #define FINISH_VISCHUNK(dest, f0, f1, left)  \
>       subcc                   %left, 8, %left;\
> @@ -239,17 +242,17 @@ FUNC_NAME:              /* %o0=dst, %o1=src, %o2=len
>       ba,pt           %xcc, 1b+4
>        faligndata     %f0, %f2, %f48
>  1:   FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
> -     STORE_JUMP(o0, f48, 40f) membar #Sync
> +     STORE_JUMP(o0, f48, 40f)
>  2:   FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
> -     STORE_JUMP(o0, f48, 48f) membar #Sync
> +     STORE_JUMP(o0, f48, 48f)
>  3:   FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
> -     STORE_JUMP(o0, f48, 56f) membar #Sync
> +     STORE_JUMP(o0, f48, 56f)
>  
>  1:   FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
>       LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -260,17 +263,17 @@ FUNC_NAME:              /* %o0=dst, %o1=src, %o2=len
>       ba,pt           %xcc, 1b+4
>        faligndata     %f2, %f4, %f48
>  1:   FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
> -     STORE_JUMP(o0, f48, 41f) membar #Sync
> +     STORE_JUMP(o0, f48, 41f)
>  2:   FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
> -     STORE_JUMP(o0, f48, 49f) membar #Sync
> +     STORE_JUMP(o0, f48, 49f)
>  3:   FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
> -     STORE_JUMP(o0, f48, 57f) membar #Sync
> +     STORE_JUMP(o0, f48, 57f)
>  
>  1:   FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
>       LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -281,17 +284,17 @@ FUNC_NAME:              /* %o0=dst, %o1=src, %o2=len
>       ba,pt           %xcc, 1b+4
>        faligndata     %f4, %f6, %f48
>  1:   FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
> -     STORE_JUMP(o0, f48, 42f) membar #Sync
> +     STORE_JUMP(o0, f48, 42f)
>  2:   FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
> -     STORE_JUMP(o0, f48, 50f) membar #Sync
> +     STORE_JUMP(o0, f48, 50f)
>  3:   FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
> -     STORE_JUMP(o0, f48, 58f) membar #Sync
> +     STORE_JUMP(o0, f48, 58f)
>  
>  1:   FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
>       LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -302,17 +305,17 @@ FUNC_NAME:              /* %o0=dst, %o1=src, %o2=len
>       ba,pt           %xcc, 1b+4
>        faligndata     %f6, %f8, %f48
>  1:   FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
> -     STORE_JUMP(o0, f48, 43f) membar #Sync
> +     STORE_JUMP(o0, f48, 43f)
>  2:   FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
> -     STORE_JUMP(o0, f48, 51f) membar #Sync
> +     STORE_JUMP(o0, f48, 51f)
>  3:   FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
> -     STORE_JUMP(o0, f48, 59f) membar #Sync
> +     STORE_JUMP(o0, f48, 59f)
>  
>  1:   FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
>       LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -323,17 +326,17 @@ FUNC_NAME:              /* %o0=dst, %o1=src, %o2=len
>       ba,pt           %xcc, 1b+4
>        faligndata     %f8, %f10, %f48
>  1:   FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
> -     STORE_JUMP(o0, f48, 44f) membar #Sync
> +     STORE_JUMP(o0, f48, 44f)
>  2:   FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
> -     STORE_JUMP(o0, f48, 52f) membar #Sync
> +     STORE_JUMP(o0, f48, 52f)
>  3:   FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
> -     STORE_JUMP(o0, f48, 60f) membar #Sync
> +     STORE_JUMP(o0, f48, 60f)
>  
>  1:   FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
>       LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -344,17 +347,17 @@ FUNC_NAME:              /* %o0=dst, %o1=src, %o2=len
>       ba,pt           %xcc, 1b+4
>        faligndata     %f10, %f12, %f48
>  1:   FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
> -     STORE_JUMP(o0, f48, 45f) membar #Sync
> +     STORE_JUMP(o0, f48, 45f)
>  2:   FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
> -     STORE_JUMP(o0, f48, 53f) membar #Sync
> +     STORE_JUMP(o0, f48, 53f)
>  3:   FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
> -     STORE_JUMP(o0, f48, 61f) membar #Sync
> +     STORE_JUMP(o0, f48, 61f)
>  
>  1:   FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
>       LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -365,17 +368,17 @@ FUNC_NAME:              /* %o0=dst, %o1=src, %o2=len
>       ba,pt           %xcc, 1b+4
>        faligndata     %f12, %f14, %f48
>  1:   FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
> -     STORE_JUMP(o0, f48, 46f) membar #Sync
> +     STORE_JUMP(o0, f48, 46f)
>  2:   FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
> -     STORE_JUMP(o0, f48, 54f) membar #Sync
> +     STORE_JUMP(o0, f48, 54f)
>  3:   FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
> -     STORE_JUMP(o0, f48, 62f) membar #Sync
> +     STORE_JUMP(o0, f48, 62f)
>  
>  1:   FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
>       LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -386,17 +389,17 @@ FUNC_NAME:              /* %o0=dst, %o1=src, %o2=len
>       ba,pt           %xcc, 1b+4
>        faligndata     %f14, %f16, %f48
>  1:   FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
> -     STORE_JUMP(o0, f48, 47f) membar #Sync
> +     STORE_JUMP(o0, f48, 47f)
>  2:   FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
> -     STORE_JUMP(o0, f48, 55f) membar #Sync
> +     STORE_JUMP(o0, f48, 55f)
>  3:   FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
> -     STORE_SYNC(o0, f48) membar #Sync
> +     STORE_SYNC(o0, f48)
>       FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
> -     STORE_JUMP(o0, f48, 63f) membar #Sync
> +     STORE_JUMP(o0, f48, 63f)
>  
>  40:  FINISH_VISCHUNK(o0, f0,  f2,  g3)
>  41:  FINISH_VISCHUNK(o0, f2,  f4,  g3)
> diff --git a/arch/sparc64/lib/VISsave.S b/arch/sparc64/lib/VISsave.S
> --- a/arch/sparc64/lib/VISsave.S
> +++ b/arch/sparc64/lib/VISsave.S
> @@ -72,7 +72,11 @@ vis1:      ldub            [%g6 + TI_FPSAVED], %g3
>  
>       stda            %f48, [%g3 + %g1] ASI_BLK_P
>  5:   membar          #Sync
> -     jmpl            %g7 + %g0, %g0
> +     ba,pt           %xcc, 80f
> +      nop
> +
> +     .align          32
> +80:  jmpl            %g7 + %g0, %g0
>        nop
>  
>  6:   ldub            [%g3 + TI_FPSAVED], %o5
> @@ -87,8 +91,11 @@ vis1:      ldub            [%g6 + TI_FPSAVED], %g3
>       stda            %f32, [%g2 + %g1] ASI_BLK_P
>       stda            %f48, [%g3 + %g1] ASI_BLK_P
>       membar          #Sync
> -     jmpl            %g7 + %g0, %g0
> +     ba,pt           %xcc, 80f
> +      nop
>  
> +     .align          32
> +80:  jmpl            %g7 + %g0, %g0
>        nop
>  
>       .align          32
> @@ -126,6 +133,10 @@ VISenterhalf:
>       stda            %f0, [%g2 + %g1] ASI_BLK_P
>       stda            %f16, [%g3 + %g1] ASI_BLK_P
>       membar          #Sync
> +     ba,pt           %xcc, 4f
> +      nop
> +
> +     .align          32
>  4:   and             %o5, FPRS_DU, %o5
>       jmpl            %g7 + %g0, %g0
>        wr             %o5, FPRS_FEF, %fprs
> diff --git a/arch/sparc64/lib/atomic.S b/arch/sparc64/lib/atomic.S
> --- a/arch/sparc64/lib/atomic.S
> +++ b/arch/sparc64/lib/atomic.S
> @@ -7,18 +7,6 @@
>  #include <linux/config.h>
>  #include <asm/asi.h>
>  
> -     /* On SMP we need to use memory barriers to ensure
> -      * correct memory operation ordering, nop these out
> -      * for uniprocessor.
> -      */
> -#ifdef CONFIG_SMP
> -#define ATOMIC_PRE_BARRIER   membar #StoreLoad | #LoadLoad
> -#define ATOMIC_POST_BARRIER  membar #StoreLoad | #StoreStore
> -#else
> -#define ATOMIC_PRE_BARRIER   nop
> -#define ATOMIC_POST_BARRIER  nop
> -#endif
> -
>       .text
>  
>       /* Two versions of the atomic routines, one that
> @@ -52,6 +40,24 @@ atomic_sub: /* %o0 = decrement, %o1 = at
>        nop
>       .size   atomic_sub, .-atomic_sub
>  
> +     /* On SMP we need to use memory barriers to ensure
> +      * correct memory operation ordering, nop these out
> +      * for uniprocessor.
> +      */
> +#ifdef CONFIG_SMP
> +
> +#define ATOMIC_PRE_BARRIER   membar #StoreLoad | #LoadLoad;
> +#define ATOMIC_POST_BARRIER  \
> +     ba,pt %xcc, 80b;        \
> +     membar #StoreLoad | #StoreStore
> +
> +80:  retl
> +      nop
> +#else
> +#define ATOMIC_PRE_BARRIER
> +#define ATOMIC_POST_BARRIER
> +#endif
> +
>       .globl  atomic_add_ret
>       .type   atomic_add_ret,#function
>  atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
> @@ -62,9 +68,10 @@ atomic_add_ret: /* %o0 = increment, %o1 
>       cmp     %g1, %g7
>       bne,pn  %icc, 1b
>        add    %g7, %o0, %g7
> +     sra     %g7, 0, %o0
>       ATOMIC_POST_BARRIER
>       retl
> -      sra    %g7, 0, %o0
> +      nop
>       .size   atomic_add_ret, .-atomic_add_ret
>  
>       .globl  atomic_sub_ret
> @@ -77,9 +84,10 @@ atomic_sub_ret: /* %o0 = decrement, %o1 
>       cmp     %g1, %g7
>       bne,pn  %icc, 1b
>        sub    %g7, %o0, %g7
> +     sra     %g7, 0, %o0
>       ATOMIC_POST_BARRIER
>       retl
> -      sra    %g7, 0, %o0
> +      nop
>       .size   atomic_sub_ret, .-atomic_sub_ret
>  
>       .globl  atomic64_add
> @@ -118,9 +126,10 @@ atomic64_add_ret: /* %o0 = increment, %o
>       cmp     %g1, %g7
>       bne,pn  %xcc, 1b
>        add    %g7, %o0, %g7
> +     mov     %g7, %o0
>       ATOMIC_POST_BARRIER
>       retl
> -      mov    %g7, %o0
> +      nop
>       .size   atomic64_add_ret, .-atomic64_add_ret
>  
>       .globl  atomic64_sub_ret
> @@ -133,7 +142,8 @@ atomic64_sub_ret: /* %o0 = decrement, %o
>       cmp     %g1, %g7
>       bne,pn  %xcc, 1b
>        sub    %g7, %o0, %g7
> +     mov     %g7, %o0
>       ATOMIC_POST_BARRIER
>       retl
> -      mov    %g7, %o0
> +      nop
>       .size   atomic64_sub_ret, .-atomic64_sub_ret
> diff --git a/arch/sparc64/lib/bitops.S b/arch/sparc64/lib/bitops.S
> --- a/arch/sparc64/lib/bitops.S
> +++ b/arch/sparc64/lib/bitops.S
> @@ -7,20 +7,26 @@
>  #include <linux/config.h>
>  #include <asm/asi.h>
>  
> +     .text
> +
>       /* On SMP we need to use memory barriers to ensure
>        * correct memory operation ordering, nop these out
>        * for uniprocessor.
>        */
> +
>  #ifdef CONFIG_SMP
>  #define BITOP_PRE_BARRIER    membar #StoreLoad | #LoadLoad
> -#define BITOP_POST_BARRIER   membar #StoreLoad | #StoreStore
> +#define BITOP_POST_BARRIER   \
> +     ba,pt   %xcc, 80b;      \
> +     membar #StoreLoad | #StoreStore
> +
> +80:  retl
> +      nop
>  #else
> -#define BITOP_PRE_BARRIER    nop
> -#define BITOP_POST_BARRIER   nop
> +#define BITOP_PRE_BARRIER
> +#define BITOP_POST_BARRIER
>  #endif
>  
> -     .text
> -
>       .globl  test_and_set_bit
>       .type   test_and_set_bit,#function
>  test_and_set_bit:    /* %o0=nr, %o1=addr */
> @@ -37,10 +43,11 @@ test_and_set_bit: /* %o0=nr, %o1=addr */
>       cmp     %g7, %g1
>       bne,pn  %xcc, 1b
>        and    %g7, %o2, %g2
> -     BITOP_POST_BARRIER
>       clr     %o0
> +     movrne  %g2, 1, %o0
> +     BITOP_POST_BARRIER
>       retl
> -      movrne %g2, 1, %o0
> +      nop
>       .size   test_and_set_bit, .-test_and_set_bit
>  
>       .globl  test_and_clear_bit
> @@ -59,10 +66,11 @@ test_and_clear_bit:       /* %o0=nr, %o1=addr 
>       cmp     %g7, %g1
>       bne,pn  %xcc, 1b
>        and    %g7, %o2, %g2
> -     BITOP_POST_BARRIER
>       clr     %o0
> +     movrne  %g2, 1, %o0
> +     BITOP_POST_BARRIER
>       retl
> -      movrne %g2, 1, %o0
> +      nop
>       .size   test_and_clear_bit, .-test_and_clear_bit
>  
>       .globl  test_and_change_bit
> @@ -81,10 +89,11 @@ test_and_change_bit:      /* %o0=nr, %o1=addr
>       cmp     %g7, %g1
>       bne,pn  %xcc, 1b
>        and    %g7, %o2, %g2
> -     BITOP_POST_BARRIER
>       clr     %o0
> +     movrne  %g2, 1, %o0
> +     BITOP_POST_BARRIER
>       retl
> -      movrne %g2, 1, %o0
> +      nop
>       .size   test_and_change_bit, .-test_and_change_bit
>  
>       .globl  set_bit
> diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c
> --- a/arch/sparc64/lib/debuglocks.c
> +++ b/arch/sparc64/lib/debuglocks.c
> @@ -252,8 +252,9 @@ wlock_again:
>  "            andn    %%g1, %%g3, %%g7\n"
>  "            casx    [%0], %%g1, %%g7\n"
>  "            cmp     %%g1, %%g7\n"
> +"            membar  #StoreLoad | #StoreStore\n"
>  "            bne,pn  %%xcc, 1b\n"
> -"             membar #StoreLoad | #StoreStore"
> +"             nop"
>               : /* no outputs */
>               : "r" (&(rw->lock))
>               : "g3", "g1", "g7", "cc", "memory");
> @@ -351,8 +352,9 @@ int _do_write_trylock (rwlock_t *rw, cha
>  "            andn    %%g1, %%g3, %%g7\n"
>  "            casx    [%0], %%g1, %%g7\n"
>  "            cmp     %%g1, %%g7\n"
> +"            membar  #StoreLoad | #StoreStore\n"
>  "            bne,pn  %%xcc, 1b\n"
> -"             membar #StoreLoad | #StoreStore"
> +"             nop"
>               : /* no outputs */
>               : "r" (&(rw->lock))
>               : "g3", "g1", "g7", "cc", "memory");
> diff --git a/arch/sparc64/lib/dec_and_lock.S b/arch/sparc64/lib/dec_and_lock.S
> --- a/arch/sparc64/lib/dec_and_lock.S
> +++ b/arch/sparc64/lib/dec_and_lock.S
> @@ -48,8 +48,9 @@ start_to_zero:
>  #endif
>  to_zero:
>       ldstub  [%o1], %g3
> +     membar  #StoreLoad | #StoreStore
>       brnz,pn %g3, spin_on_lock
> -      membar #StoreLoad | #StoreStore
> +      nop
>  loop2:       cas     [%o0], %g2, %g7         /* ASSERT(g7 == 0) */
>       cmp     %g2, %g7
>  
> @@ -71,8 +72,9 @@ loop2:      cas     [%o0], %g2, %g7         /* ASSERT(g7
>        nop
>  spin_on_lock:
>       ldub    [%o1], %g3
> +     membar  #LoadLoad
>       brnz,pt %g3, spin_on_lock
> -      membar #LoadLoad
> +      nop
>       ba,pt   %xcc, to_zero
>        nop
>       nop
> diff --git a/arch/sparc64/lib/rwsem.S b/arch/sparc64/lib/rwsem.S
> --- a/arch/sparc64/lib/rwsem.S
> +++ b/arch/sparc64/lib/rwsem.S
> @@ -17,8 +17,9 @@ __down_read:
>       bne,pn          %icc, 1b
>        add            %g7, 1, %g7
>       cmp             %g7, 0
> +     membar          #StoreLoad | #StoreStore
>       bl,pn           %icc, 3f
> -      membar         #StoreLoad | #StoreStore
> +      nop
>  2:
>       retl
>        nop
> @@ -57,8 +58,9 @@ __down_write:
>       cmp             %g3, %g7
>       bne,pn          %icc, 1b
>        cmp            %g7, 0
> +     membar          #StoreLoad | #StoreStore
>       bne,pn          %icc, 3f
> -      membar         #StoreLoad | #StoreStore
> +      nop
>  2:   retl
>        nop
>  3:
> @@ -97,8 +99,9 @@ __up_read:
>       cmp             %g1, %g7
>       bne,pn          %icc, 1b
>        cmp            %g7, 0
> +     membar          #StoreLoad | #StoreStore
>       bl,pn           %icc, 3f
> -      membar         #StoreLoad | #StoreStore
> +      nop
>  2:   retl
>        nop
>  3:   sethi           %hi(RWSEM_ACTIVE_MASK), %g1
> @@ -126,8 +129,9 @@ __up_write:
>       bne,pn          %icc, 1b
>        sub            %g7, %g1, %g7
>       cmp             %g7, 0
> +     membar          #StoreLoad | #StoreStore
>       bl,pn           %icc, 3f
> -      membar         #StoreLoad | #StoreStore
> +      nop
>  2:
>       retl
>        nop
> @@ -151,8 +155,9 @@ __downgrade_write:
>       bne,pn          %icc, 1b
>        sub            %g7, %g1, %g7
>       cmp             %g7, 0
> +     membar          #StoreLoad | #StoreStore
>       bl,pn           %icc, 3f
> -      membar         #StoreLoad | #StoreStore
> +      nop
>  2:
>       retl
>        nop
> diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
> --- a/arch/sparc64/mm/init.c
> +++ b/arch/sparc64/mm/init.c
> @@ -136,8 +136,9 @@ static __inline__ void set_dcache_dirty(
>                            "or        %%g1, %0, %%g1\n\t"
>                            "casx      [%2], %%g7, %%g1\n\t"
>                            "cmp       %%g7, %%g1\n\t"
> +                          "membar    #StoreLoad | #StoreStore\n\t"
>                            "bne,pn    %%xcc, 1b\n\t"
> -                          " membar   #StoreLoad | #StoreStore"
> +                          " nop"
>                            : /* no outputs */
>                            : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags)
>                            : "g1", "g7");
> @@ -157,8 +158,9 @@ static __inline__ void clear_dcache_dirt
>                            " andn     %%g7, %1, %%g1\n\t"
>                            "casx      [%2], %%g7, %%g1\n\t"
>                            "cmp       %%g7, %%g1\n\t"
> +                          "membar    #StoreLoad | #StoreStore\n\t"
>                            "bne,pn    %%xcc, 1b\n\t"
> -                          " membar   #StoreLoad | #StoreStore\n"
> +                          " nop\n"
>                            "2:"
>                            : /* no outputs */
>                            : "r" (cpu), "r" (mask), "r" (&page->flags),
> diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S
> --- a/arch/sparc64/mm/ultra.S
> +++ b/arch/sparc64/mm/ultra.S
> @@ -266,8 +266,9 @@ __cheetah_flush_tlb_pending:      /* 22 insns
>        andn           %o3, 1, %o3
>       stxa            %g0, [%o3] ASI_IMMU_DEMAP
>  2:   stxa            %g0, [%o3] ASI_DMMU_DEMAP       
> +     membar          #Sync
>       brnz,pt         %o1, 1b
> -      membar         #Sync
> +      nop
>       stxa            %g2, [%o4] ASI_DMMU
>       flush           %g6
>       wrpr            %g0, 0, %tl
> diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h
> --- a/include/asm-sparc64/rwsem.h
> +++ b/include/asm-sparc64/rwsem.h
> @@ -55,8 +55,9 @@ static __inline__ int rwsem_atomic_updat
>               "add            %%g1, %1, %%g7\n\t"
>               "cas            [%2], %%g1, %%g7\n\t"
>               "cmp            %%g1, %%g7\n\t"
> +             "membar         #StoreLoad | #StoreStore\n\t"
>               "bne,pn         %%icc, 1b\n\t"
> -             " membar        #StoreLoad | #StoreStore\n\t"
> +             " nop\n\t"
>               "mov            %%g7, %0\n\t"
>               : "=&r" (tmp)
>               : "0" (tmp), "r" (sem)
> diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h
> --- a/include/asm-sparc64/spinlock.h
> +++ b/include/asm-sparc64/spinlock.h
> @@ -52,12 +52,14 @@ static inline void _raw_spin_lock(spinlo
>  
>       __asm__ __volatile__(
>  "1:  ldstub          [%1], %0\n"
> +"    membar          #StoreLoad | #StoreStore\n"
>  "    brnz,pn         %0, 2f\n"
> -"     membar         #StoreLoad | #StoreStore\n"
> +"     nop\n"
>  "    .subsection     2\n"
>  "2:  ldub            [%1], %0\n"
> +"    membar          #LoadLoad\n"
>  "    brnz,pt         %0, 2b\n"
> -"     membar         #LoadLoad\n"
> +"     nop\n"
>  "    ba,a,pt         %%xcc, 1b\n"
>  "    .previous"
>       : "=&r" (tmp)
> @@ -95,16 +97,18 @@ static inline void _raw_spin_lock_flags(
>  
>       __asm__ __volatile__(
>  "1:  ldstub          [%2], %0\n"
> -"    brnz,pn         %0, 2f\n"
>  "    membar          #StoreLoad | #StoreStore\n"
> +"    brnz,pn         %0, 2f\n"
> +"     nop\n"
>  "    .subsection     2\n"
>  "2:  rdpr            %%pil, %1\n"
>  "    wrpr            %3, %%pil\n"
>  "3:  ldub            [%2], %0\n"
> -"    brnz,pt         %0, 3b\n"
>  "    membar          #LoadLoad\n"
> +"    brnz,pt         %0, 3b\n"
> +"     nop\n"
>  "    ba,pt           %%xcc, 1b\n"
> -"    wrpr            %1, %%pil\n"
> +"     wrpr           %1, %%pil\n"
>  "    .previous"
>       : "=&r" (tmp1), "=&r" (tmp2)
>       : "r"(lock), "r"(flags)
> @@ -162,12 +166,14 @@ static void inline __read_lock(rwlock_t 
>  "4:   add            %0, 1, %1\n"
>  "    cas             [%2], %0, %1\n"
>  "    cmp             %0, %1\n"
> +"    membar          #StoreLoad | #StoreStore\n"
>  "    bne,pn          %%icc, 1b\n"
> -"     membar         #StoreLoad | #StoreStore\n"
> +"     nop\n"
>  "    .subsection     2\n"
>  "2:  ldsw            [%2], %0\n"
> +"    membar          #LoadLoad\n"
>  "    brlz,pt         %0, 2b\n"
> -"     membar         #LoadLoad\n"
> +"     nop\n"
>  "    ba,a,pt         %%xcc, 4b\n"
>  "    .previous"
>       : "=&r" (tmp1), "=&r" (tmp2)
> @@ -204,12 +210,14 @@ static void inline __write_lock(rwlock_t
>  "4:   or             %0, %3, %1\n"
>  "    cas             [%2], %0, %1\n"
>  "    cmp             %0, %1\n"
> +"    membar          #StoreLoad | #StoreStore\n"
>  "    bne,pn          %%icc, 1b\n"
> -"     membar         #StoreLoad | #StoreStore\n"
> +"     nop\n"
>  "    .subsection     2\n"
>  "2:  lduw            [%2], %0\n"
> +"    membar          #LoadLoad\n"
>  "    brnz,pt         %0, 2b\n"
> -"     membar         #LoadLoad\n"
> +"     nop\n"
>  "    ba,a,pt         %%xcc, 4b\n"
>  "    .previous"
>       : "=&r" (tmp1), "=&r" (tmp2)
> @@ -240,8 +248,9 @@ static int inline __write_trylock(rwlock
>  "     or             %0, %4, %1\n"
>  "    cas             [%3], %0, %1\n"
>  "    cmp             %0, %1\n"
> +"    membar          #StoreLoad | #StoreStore\n"
>  "    bne,pn          %%icc, 1b\n"
> -"     membar         #StoreLoad | #StoreStore\n"
> +"     nop\n"
>  "    mov             1, %2\n"
>  "2:"
>       : "=&r" (tmp1), "=&r" (tmp2), "=&r" (result)
> diff --git a/include/asm-sparc64/spitfire.h b/include/asm-sparc64/spitfire.h
> --- a/include/asm-sparc64/spitfire.h
> +++ b/include/asm-sparc64/spitfire.h
> @@ -111,7 +111,6 @@ static __inline__ void spitfire_put_dcac
>                            "membar    #Sync"
>                            : /* No outputs */
>                            : "r" (tag), "r" (addr), "i" (ASI_DCACHE_TAG));
> -     __asm__ __volatile__ ("membar #Sync" : : : "memory");
>  }
>  
>  /* The instruction cache lines are flushed with this, but note that
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.1 (GNU/Linux)
Comment: Using GnuPG with Thunderbird - http://enigmail.mozdev.org

iD8DBQFCwDueFAhB33r2ACYRAmcpAJ9ZJsS/tymHUBWuXgPzWUimK6Xv4gCdF0n3
cQBeEJPGePB1Uw0MwJ5sUpg=
=f/TR
-----END PGP SIGNATURE-----
-- 
[email protected] mailing list

Reply via email to