The existing arch_xchg/arch_cmpxchg operations implement acquire/release
ordering by inserting fence instructions before or after the atomic
instructions. Replace those fences with real acquire/release semantics,
i.e. the .aq/.rl annotations on the AMO and LR/SC instructions, as
summarized below:

|----------------------------------------------------------------|
|    |    arch_xchg_release       |     arch_cmpxchg_release     |
|    |-----------------------------------------------------------|
|    | zabha      | !zabha        | zabha+zacas | !(zabha+zacas) |
| rl |-----------------------------------------------------------|
|    |            | (fence rw, w) |             | (fence rw, w)  |
|    | amoswap.rl | lr.w          | amocas.rl   | lr.w           |
|    |            | sc.w.rl       |             | sc.w.rl        |
|----------------------------------------------------------------|
|    |    arch_xchg_acquire       |     arch_cmpxchg_acquire     |
|    |-----------------------------------------------------------|
|    | zabha      | !zabha        | zabha+zacas | !(zabha+zacas) |
| aq |-----------------------------------------------------------|
|    |            | lr.w.aq       |             | lr.w.aq        |
|    | amoswap.aq | sc.w          | amocas.aq   | sc.w           |
|    |            | (fence r, rw) |             | (fence r, rw)  |
|----------------------------------------------------------------|

The (fence rw, w) and (fence r, rw) entries above mean that these fence
instructions are only inserted when the zalasr extension is not implemented.
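
For illustration only, here is a minimal sketch of the lr/sc release pattern
from the !zabha column, reduced to a word-sized swap for brevity (the patch
applies this pattern to the masked byte/halfword cases). The function name
xchg32_release_sketch and the has_zalasr parameter are made up for this
sketch; in the patch the fence is selected at boot time via ALTERNATIVE()
rather than a runtime check:

  /*
   * Hypothetical helper, not part of the patch: a word-sized
   * xchg_release built from lr/sc.  The optional leading fence is
   * modelled with a plain runtime check here.
   */
  unsigned int xchg32_release_sketch(unsigned int *p, unsigned int newval,
                                     int has_zalasr)
  {
          unsigned int ret, rc;

          if (!has_zalasr)        /* the (fence rw, w) entry in the table */
                  __asm__ __volatile__("fence rw, w" ::: "memory");

          __asm__ __volatile__(
                  "0:     lr.w    %0, %2\n"     /* load old value, make reservation */
                  "       sc.w.rl %1, %3, %2\n" /* store new value with release */
                  "       bnez    %1, 0b\n"     /* retry if the reservation was lost */
                  : "=&r" (ret), "=&r" (rc), "+A" (*p)
                  : "r" (newval)
                  : "memory");

          return ret;
  }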

Signed-off-by: Xu Lu <luxu.ker...@bytedance.com>
---
 arch/riscv/include/asm/atomic.h  |   6 --
 arch/riscv/include/asm/cmpxchg.h | 136 ++++++++++++++-----------------
 2 files changed, 63 insertions(+), 79 deletions(-)
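
As context for the macro changes below (illustrative only, not part of the
patch): the LR_SFX()/SC_SFX()/CAS_SFX()/SC_PREPEND()/SC_APPEND() wrappers are
identity macros; they only label what each argument of _arch_xchg() and
_arch_cmpxchg() means at the call sites, and the suffix strings are then
pasted straight into the asm templates. A toy, self-contained C example of
the same idea, with BUILD_INSN() made up for demonstration:

  #include <stdio.h>

  /* Identity wrappers, mirroring the LR_SFX()/SC_SFX() definitions added
   * below: they expand to their argument unchanged and exist purely to
   * label the arguments at the call sites. */
  #define LR_SFX(x)       x
  #define SC_SFX(x)       x

  /* Made-up macro: the suffix strings are pasted into an instruction
   * template, just as the real macros paste them into the inline asm. */
  #define BUILD_INSN(lr_sfx, sc_sfx)      "lr.w" lr_sfx "; ...; sc.w" sc_sfx

  int main(void)
  {
          puts(BUILD_INSN(LR_SFX(".aq"), SC_SFX("")));    /* lr.w.aq; ...; sc.w */
          return 0;
  }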

diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 5b96c2f61adb5..b79a4f889f339 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -18,12 +18,6 @@
 
 #include <asm/cmpxchg.h>
 
-#define __atomic_acquire_fence()                                       \
-       __asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory")
-
-#define __atomic_release_fence()                                       \
-       __asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory");
-
 static __always_inline int arch_atomic_read(const atomic_t *v)
 {
        return READ_ONCE(v->counter);
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 0b749e7102162..207fdba38d1fc 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -15,15 +15,23 @@
 #include <asm/cpufeature-macros.h>
 #include <asm/processor.h>
 
-#define __arch_xchg_masked(sc_sfx, swap_sfx, prepend, sc_append,              \
-                          swap_append, r, p, n)                               \
+/*
+ * These macros are here to improve the readability of the arch_xchg_XXX()
+ * and arch_cmpxchg_XXX() macros.
+ */
+#define LR_SFX(x)              x
+#define SC_SFX(x)              x
+#define CAS_SFX(x)             x
+#define SC_PREPEND(x)          x
+#define SC_APPEND(x)           x
+
+#define __arch_xchg_masked(lr_sfx, sc_sfx, swap_sfx, sc_prepend, sc_append,   \
+                          r, p, n)                                            \
 ({                                                                             \
        if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) &&                               \
            riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA)) {                \
                __asm__ __volatile__ (                                          \
-                       prepend                                                 \
                        "       amoswap" swap_sfx " %0, %z2, %1\n"              \
-                       swap_append                                             \
                        : "=&r" (r), "+A" (*(p))                                \
                        : "rJ" (n)                                              \
                        : "memory");                                            \
@@ -37,14 +45,16 @@
                ulong __rc;                                                     \
                                                                                \
                __asm__ __volatile__ (                                          \
-                       prepend                                                 \
                        PREFETCHW_ASM(%5)                                       \
+                       ALTERNATIVE(__nops(1), sc_prepend,                      \
+                                   0, RISCV_ISA_EXT_ZALASR, 1)                 \
                        "0:     lr.w %0, %2\n"                                  \
                        "       and  %1, %0, %z4\n"                             \
                        "       or   %1, %1, %z3\n"                             \
                        "       sc.w" sc_sfx " %1, %1, %2\n"                    \
                        "       bnez %1, 0b\n"                                  \
-                       sc_append                                               \
+                       ALTERNATIVE(__nops(1), sc_append,                       \
+                                   0, RISCV_ISA_EXT_ZALASR, 1)                 \
                        : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))      \
                        : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)        \
                        : "memory");                                            \
@@ -53,19 +63,17 @@
        }                                                                       \
 })
 
-#define __arch_xchg(sfx, prepend, append, r, p, n)                     \
+#define __arch_xchg(sfx, r, p, n)                                      \
 ({                                                                     \
        __asm__ __volatile__ (                                          \
-               prepend                                                 \
                "       amoswap" sfx " %0, %2, %1\n"                    \
-               append                                                  \
                : "=r" (r), "+A" (*(p))                                 \
                : "r" (n)                                               \
                : "memory");                                            \
 })
 
-#define _arch_xchg(ptr, new, sc_sfx, swap_sfx, prepend,                \
-                  sc_append, swap_append)                              \
+#define _arch_xchg(ptr, new, lr_sfx, sc_sfx, swap_sfx,                 \
+                  sc_prepend, sc_append)                               \
 ({                                                                     \
        __typeof__(ptr) __ptr = (ptr);                                  \
        __typeof__(*(__ptr)) __new = (new);                             \
@@ -73,22 +81,20 @@
                                                                        \
        switch (sizeof(*__ptr)) {                                       \
        case 1:                                                         \
-               __arch_xchg_masked(sc_sfx, ".b" swap_sfx,               \
-                                  prepend, sc_append, swap_append,     \
+               __arch_xchg_masked(lr_sfx, sc_sfx, ".b" swap_sfx,       \
+                                  sc_prepend, sc_append,               \
                                   __ret, __ptr, __new);                \
                break;                                                  \
        case 2:                                                         \
-               __arch_xchg_masked(sc_sfx, ".h" swap_sfx,               \
-                                  prepend, sc_append, swap_append,     \
+               __arch_xchg_masked(lr_sfx, sc_sfx, ".h" swap_sfx,       \
+                                  sc_prepend, sc_append,               \
                                   __ret, __ptr, __new);                \
                break;                                                  \
        case 4:                                                         \
-               __arch_xchg(".w" swap_sfx, prepend, swap_append,        \
-                             __ret, __ptr, __new);                     \
+               __arch_xchg(".w" swap_sfx, __ret, __ptr, __new);        \
                break;                                                  \
        case 8:                                                         \
-               __arch_xchg(".d" swap_sfx, prepend, swap_append,        \
-                             __ret, __ptr, __new);                     \
+               __arch_xchg(".d" swap_sfx, __ret, __ptr, __new);        \
                break;                                                  \
        default:                                                        \
                BUILD_BUG();                                            \
@@ -97,17 +103,23 @@
 })
 
 #define arch_xchg_relaxed(ptr, x)                                      \
-       _arch_xchg(ptr, x, "", "", "", "", "")
+       _arch_xchg(ptr, x, LR_SFX(""), SC_SFX(""), CAS_SFX(""),         \
+                  SC_PREPEND(__nops(1)), SC_APPEND(__nops(1)))
 
 #define arch_xchg_acquire(ptr, x)                                      \
-       _arch_xchg(ptr, x, "", "", "",                                  \
-                  RISCV_ACQUIRE_BARRIER, RISCV_ACQUIRE_BARRIER)
+       _arch_xchg(ptr, x, LR_SFX(".aq"), SC_SFX(""), CAS_SFX(".aq"),   \
+                  SC_PREPEND(__nops(1)),                               \
+                  SC_APPEND(RISCV_ACQUIRE_BARRIER))
 
 #define arch_xchg_release(ptr, x)                                      \
-       _arch_xchg(ptr, x, "", "", RISCV_RELEASE_BARRIER, "", "")
+       _arch_xchg(ptr, x, LR_SFX(""), SC_SFX(".rl"), CAS_SFX(".rl"),   \
+                  SC_PREPEND(RISCV_RELEASE_BARRIER),                   \
+                  SC_APPEND(__nops(1)))
 
 #define arch_xchg(ptr, x)                                              \
-       _arch_xchg(ptr, x, ".rl", ".aqrl", "", RISCV_FULL_BARRIER, "")
+       _arch_xchg(ptr, x, LR_SFX(""), SC_SFX(".aqrl"),                 \
+                  CAS_SFX(".aqrl"), SC_PREPEND(__nops(1)),             \
+                  SC_APPEND(__nops(1)))
 
 #define xchg32(ptr, x)                                                 \
 ({                                                                     \
@@ -126,9 +138,7 @@
  * store NEW in MEM.  Return the initial value in MEM.  Success is
  * indicated by comparing RETURN with OLD.
  */
-#define __arch_cmpxchg_masked(sc_sfx, cas_sfx,                                 \
-                             sc_prepend, sc_append,                            \
-                             cas_prepend, cas_append,                          \
+#define __arch_cmpxchg_masked(lr_sfx, sc_sfx, cas_sfx, sc_prepend, sc_append,  \
                              r, p, o, n)                                       \
 ({                                                                             \
        if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) &&                               \
@@ -138,9 +148,7 @@
                r = o;                                                          \
                                                                                \
                __asm__ __volatile__ (                                          \
-                       cas_prepend                                             \
                        "       amocas" cas_sfx " %0, %z2, %1\n"                \
-                       cas_append                                              \
                        : "+&r" (r), "+A" (*(p))                                \
                        : "rJ" (n)                                              \
                        : "memory");                                            \
@@ -155,15 +163,17 @@
                ulong __rc;                                                     \
                                                                                \
                __asm__ __volatile__ (                                          \
-                       sc_prepend                                              \
-                       "0:     lr.w %0, %2\n"                                  \
+                       ALTERNATIVE(__nops(1), sc_prepend,                      \
+                                   0, RISCV_ISA_EXT_ZALASR, 1)                 \
+                       "0:     lr.w" lr_sfx " %0, %2\n"                        \
                        "       and  %1, %0, %z5\n"                             \
                        "       bne  %1, %z3, 1f\n"                             \
                        "       and  %1, %0, %z6\n"                             \
                        "       or   %1, %1, %z4\n"                             \
                        "       sc.w" sc_sfx " %1, %1, %2\n"                    \
                        "       bnez %1, 0b\n"                                  \
-                       sc_append                                               \
+                       ALTERNATIVE(__nops(1), sc_append,                       \
+                                   0, RISCV_ISA_EXT_ZALASR, 1)                 \
                        "1:\n"                                                  \
                        : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))      \
                        : "rJ" ((long)__oldx), "rJ" (__newx),                   \
@@ -174,9 +184,7 @@
        }                                                                       \
 })
 
-#define __arch_cmpxchg(lr_sfx, sc_sfx, cas_sfx,                        \
-                      sc_prepend, sc_append,                           \
-                      cas_prepend, cas_append,                         \
+#define __arch_cmpxchg(lr_sfx, sc_sfx, cas_sfx, sc_prepend, sc_append, \
                       r, p, co, o, n)                                  \
 ({                                                                     \
        if (IS_ENABLED(CONFIG_RISCV_ISA_ZACAS) &&                       \
@@ -184,9 +192,7 @@
                r = o;                                                  \
                                                                        \
                __asm__ __volatile__ (                                  \
-                       cas_prepend                                     \
                        "       amocas" cas_sfx " %0, %z2, %1\n"        \
-                       cas_append                                      \
                        : "+&r" (r), "+A" (*(p))                        \
                        : "rJ" (n)                                      \
                        : "memory");                                    \
@@ -194,12 +200,14 @@
                register unsigned int __rc;                             \
                                                                        \
                __asm__ __volatile__ (                                  \
-                       sc_prepend                                      \
+                       ALTERNATIVE(__nops(1), sc_prepend,              \
+                                   0, RISCV_ISA_EXT_ZALASR, 1)         \
                        "0:     lr" lr_sfx " %0, %2\n"                  \
                        "       bne  %0, %z3, 1f\n"                     \
                        "       sc" sc_sfx " %1, %z4, %2\n"             \
                        "       bnez %1, 0b\n"                          \
-                       sc_append                                       \
+                       ALTERNATIVE(__nops(1), sc_append,               \
+                                   0, RISCV_ISA_EXT_ZALASR, 1)         \
                        "1:\n"                                          \
                        : "=&r" (r), "=&r" (__rc), "+A" (*(p))          \
                        : "rJ" (co o), "rJ" (n)                         \
@@ -207,9 +215,8 @@
        }                                                               \
 })
 
-#define _arch_cmpxchg(ptr, old, new, sc_sfx, cas_sfx,                  \
-                     sc_prepend, sc_append,                            \
-                     cas_prepend, cas_append)                          \
+#define _arch_cmpxchg(ptr, old, new, lr_sfx, sc_sfx, cas_sfx,          \
+                     sc_prepend, sc_append)                            \
 ({                                                                     \
        __typeof__(ptr) __ptr = (ptr);                                  \
        __typeof__(*(__ptr)) __old = (old);                             \
@@ -218,27 +225,23 @@
                                                                        \
        switch (sizeof(*__ptr)) {                                       \
        case 1:                                                         \
-               __arch_cmpxchg_masked(sc_sfx, ".b" cas_sfx,             \
+               __arch_cmpxchg_masked(lr_sfx, sc_sfx, ".b" cas_sfx,     \
                                      sc_prepend, sc_append,            \
-                                     cas_prepend, cas_append,          \
                                      __ret, __ptr, __old, __new);      \
                break;                                                  \
        case 2:                                                         \
-               __arch_cmpxchg_masked(sc_sfx, ".h" cas_sfx,             \
+               __arch_cmpxchg_masked(lr_sfx, sc_sfx, ".h" cas_sfx,     \
                                      sc_prepend, sc_append,            \
-                                     cas_prepend, cas_append,          \
                                      __ret, __ptr, __old, __new);      \
                break;                                                  \
        case 4:                                                         \
-               __arch_cmpxchg(".w", ".w" sc_sfx, ".w" cas_sfx,         \
+               __arch_cmpxchg(".w" lr_sfx, ".w" sc_sfx, ".w" cas_sfx,  \
                               sc_prepend, sc_append,                   \
-                              cas_prepend, cas_append,                 \
                               __ret, __ptr, (long)(int)(long), __old, __new);  \
                break;                                                  \
        case 8:                                                         \
-               __arch_cmpxchg(".d", ".d" sc_sfx, ".d" cas_sfx,         \
+               __arch_cmpxchg(".d" lr_sfx, ".d" sc_sfx, ".d" cas_sfx,  \
                               sc_prepend, sc_append,                   \
-                              cas_prepend, cas_append,                 \
                               __ret, __ptr, /**/, __old, __new);       \
                break;                                                  \
        default:                                                        \
@@ -247,40 +250,27 @@
        (__typeof__(*(__ptr)))__ret;                                    \
 })
 
-/*
- * These macros are here to improve the readability of the arch_cmpxchg_XXX()
- * macros.
- */
-#define SC_SFX(x)      x
-#define CAS_SFX(x)     x
-#define SC_PREPEND(x)  x
-#define SC_APPEND(x)   x
-#define CAS_PREPEND(x) x
-#define CAS_APPEND(x)  x
-
 #define arch_cmpxchg_relaxed(ptr, o, n)                                        \
        _arch_cmpxchg((ptr), (o), (n),                                  \
-                     SC_SFX(""), CAS_SFX(""),                          \
-                     SC_PREPEND(""), SC_APPEND(""),                    \
-                     CAS_PREPEND(""), CAS_APPEND(""))
+                     LR_SFX(""), SC_SFX(""), CAS_SFX(""),              \
+                     SC_PREPEND(__nops(1)), SC_APPEND(__nops(1)))
 
 #define arch_cmpxchg_acquire(ptr, o, n)                                        \
        _arch_cmpxchg((ptr), (o), (n),                                  \
-                     SC_SFX(""), CAS_SFX(""),                          \
-                     SC_PREPEND(""), SC_APPEND(RISCV_ACQUIRE_BARRIER), \
-                     CAS_PREPEND(""), CAS_APPEND(RISCV_ACQUIRE_BARRIER))
+                     LR_SFX(".aq"), SC_SFX(""), CAS_SFX(".aq"),        \
+                     SC_PREPEND(__nops(1)),                            \
+                     SC_APPEND(RISCV_ACQUIRE_BARRIER))
 
 #define arch_cmpxchg_release(ptr, o, n)                                        \
        _arch_cmpxchg((ptr), (o), (n),                                  \
-                     SC_SFX(""), CAS_SFX(""),                          \
-                     SC_PREPEND(RISCV_RELEASE_BARRIER), SC_APPEND(""), \
-                     CAS_PREPEND(RISCV_RELEASE_BARRIER), CAS_APPEND(""))
+                     LR_SFX(""), SC_SFX(".rl"), CAS_SFX(".rl"),        \
+                     SC_PREPEND(RISCV_RELEASE_BARRIER),                \
+                     SC_APPEND(__nops(1)))
 
 #define arch_cmpxchg(ptr, o, n)                                                \
        _arch_cmpxchg((ptr), (o), (n),                                  \
-                     SC_SFX(".rl"), CAS_SFX(".aqrl"),                  \
-                     SC_PREPEND(""), SC_APPEND(RISCV_FULL_BARRIER),    \
-                     CAS_PREPEND(""), CAS_APPEND(""))
+                     LR_SFX(""), SC_SFX(".aqrl"), CAS_SFX(".aqrl"),    \
+                     SC_PREPEND(__nops(1)), SC_APPEND(__nops(1)))
 
 #define arch_cmpxchg_local(ptr, o, n)                                  \
        arch_cmpxchg_relaxed((ptr), (o), (n))
-- 
2.20.1

