On 05.02.2024 16:32, Oleksii Kurochko wrote:
> --- /dev/null
> +++ b/xen/arch/riscv/include/asm/cmpxchg.h
> @@ -0,0 +1,237 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/* Copyright (C) 2014 Regents of the University of California */
> +
> +#ifndef _ASM_RISCV_CMPXCHG_H
> +#define _ASM_RISCV_CMPXCHG_H
> +
> +#include <xen/compiler.h>
> +#include <xen/lib.h>
> +
> +#include <asm/fence.h>
> +#include <asm/io.h>
> +#include <asm/system.h>
> +
> +#define ALIGN_DOWN(addr, size)  ((addr) & (~((size) - 1)))

This feels risky: Consider what happens when someone passes 2U as 2nd argument.
The cheapest adjustment to make would be to use 1UL in the expression.

> +#define __amoswap_generic(ptr, new, ret, sfx, release_barrier, 
> acquire_barrier) \
> +({ \
> +    asm volatile( \
> +        release_barrier \
> +        " amoswap" sfx " %0, %2, %1\n" \

While I won't insist, the revision log says \n were dropped from asm()
where not needed. A separator is needed here only if ...

> +        acquire_barrier \

... this isn't blank. Which imo suggests that the separator should be
part of the argument passed in. But yes, one can view this differently,
hence why I said I won't insist.

As to the naming of the two - I'd generally suggest to make as few
implications as possible: It doesn't really matter here whether it's
acquire or release; that matters at the use sites. What matters here
is that one is a "pre" barrier and the other is a "post" one.

> +        : "=r" (ret), "+A" (*ptr) \
> +        : "r" (new) \
> +        : "memory" ); \
> +})
> +
> +#define emulate_xchg_1_2(ptr, new, ret, release_barrier, acquire_barrier) \
> +({ \
> +    uint32_t *ptr_32b_aligned = (uint32_t *)ALIGN_DOWN((unsigned long)ptr, 
> 4); \

You now appear to assume that this macro is only used with inputs not
crossing word boundaries. That's okay as long as suitably guaranteed
at the use sites, but imo wants saying in a comment.

> +    uint8_t mask_l = ((unsigned long)(ptr) & (0x8 - sizeof(*ptr))) * 
> BITS_PER_BYTE; \

Why 0x8 (i.e. spanning 64 bits), not 4 (matching the uint32_t use
above)?

> +    uint8_t mask_size = sizeof(*ptr) * BITS_PER_BYTE; \
> +    uint8_t mask_h = mask_l + mask_size - 1; \
> +    unsigned long mask = GENMASK(mask_h, mask_l); \

Personally I find this confusing, naming-wise: GENMASK() takes bit
positions as inputs, not masks. (Initially, because of this, I
thought the calculations all can't be quite right.)

> +    unsigned long new_ = (unsigned long)(new) << mask_l; \
> +    unsigned long ret_; \
> +    unsigned long rc; \

Similarly, why unsigned long here?

I also wonder about the mix of underscore suffixed (or not) variable
names here.

> +    \
> +    asm volatile( \

Nit: Missing blank before opening parenthesis.

> +        release_barrier \
> +        "0: lr.d %0, %2\n" \

Even here it's an 8-byte access. Even if - didn't check - the insn was
okay to use with just a 4-byte aligned pointer, wouldn't it make sense
then to 8-byte align it, and be consistent throughout this macro wrt
the base unit acted upon? Alternatively, why not use lr.w here, thus
reducing possible collisions between multiple CPUs accessing the same
cache line?

> +        "   and  %1, %0, %z4\n" \
> +        "   or   %1, %1, %z3\n" \
> +        "   sc.d %1, %1, %2\n" \
> +        "   bnez %1, 0b\n" \
> +        acquire_barrier \
> +        : "=&r" (ret_), "=&r" (rc), "+A" (*ptr_32b_aligned) \
> +        : "rJ" (new_), "rJ" (~mask) \

I think that as soon as there are more than 2 or maybe 3 operands,
legibility is vastly improved by using named asm() operands.

> +        : "memory"); \

Nit: Missing blank before closing parenthesis.

> +    \
> +    ret = (__typeof__(*(ptr)))((ret_ & mask) >> mask_l); \
> +})

Why does "ret" need to be a macro argument? If you had only the
expression here, not the assignment, ...

> +#define __xchg_generic(ptr, new, size, sfx, release_barrier, 
> acquire_barrier) \
> +({ \
> +    __typeof__(ptr) ptr__ = (ptr); \

Is this local variable really needed? Can't you use "ptr" directly
in the three macro invocations?

> +    __typeof__(*(ptr)) new__ = (new); \
> +    __typeof__(*(ptr)) ret__; \
> +    switch (size) \
> +    { \
> +    case 1: \
> +    case 2: \
> +        emulate_xchg_1_2(ptr__, new__, ret__, release_barrier, 
> acquire_barrier); \

... this would become

        ret__ = emulate_xchg_1_2(ptr__, new__, release_barrier, 
acquire_barrier); \

But, unlike assumed above, there's no enforcement here that a 2-byte
quantity won't cross a word, double-word, cache line, or even page
boundary. That might be okay if then the code would simply crash (like
the AMO insns emitted further down would), but aiui silent misbehavior
would result.

Also nit: The switch() higher up is (still/again) missing blanks.

> +        break; \
> +    case 4: \
> +        __amoswap_generic(ptr__, new__, ret__,\
> +                          ".w" sfx,  release_barrier, acquire_barrier); \
> +        break; \
> +    case 8: \
> +        __amoswap_generic(ptr__, new__, ret__,\
> +                          ".d" sfx,  release_barrier, acquire_barrier); \
> +        break; \
> +    default: \
> +        STATIC_ASSERT_UNREACHABLE(); \
> +    } \
> +    ret__; \
> +})
> +
> +#define xchg_relaxed(ptr, x) \
> +({ \
> +    __typeof__(*(ptr)) x_ = (x); \
> +    (__typeof__(*(ptr)))__xchg_generic(ptr, x_, sizeof(*(ptr)), "", "", ""); 
> \
> +})
> +
> +#define xchg_acquire(ptr, x) \
> +({ \
> +    __typeof__(*(ptr)) x_ = (x); \
> +    (__typeof__(*(ptr)))__xchg_generic(ptr, x_, sizeof(*(ptr)), \
> +                                       "", "", RISCV_ACQUIRE_BARRIER); \
> +})
> +
> +#define xchg_release(ptr, x) \
> +({ \
> +    __typeof__(*(ptr)) x_ = (x); \
> +    (__typeof__(*(ptr)))__xchg_generic(ptr, x_, sizeof(*(ptr)),\
> +                                       "", RISCV_RELEASE_BARRIER, ""); \
> +})
> +
> +#define xchg(ptr,x) \
> +({ \
> +    __typeof__(*(ptr)) ret__; \
> +    ret__ = (__typeof__(*(ptr))) \
> +            __xchg_generic(ptr, (unsigned long)(x), sizeof(*(ptr)), \
> +                           ".aqrl", "", ""); \

The .aqrl doesn't look to affect the (emulated) 1- and 2-byte cases.

Further, amoswap also exists in release-only and acquire-only forms.
Why do you prefer explicit barrier insns over those? (Looks to
similarly apply to the emulation path as well as to the cmpxchg
machinery then, as both lr and sc also come in all four possible
acquire/release forms. Perhaps for the emulation path using
explicit barriers is better, in case the acquire/release forms of
lr/sc - being used inside the loop - might perform worse.)

> +    ret__; \
> +})
> +
> +#define __generic_cmpxchg(ptr, old, new, ret, lr_sfx, sc_sfx, 
> release_barrier, acquire_barrier)      \
> + ({ \
> +    register unsigned int rc; \
> +    asm volatile( \
> +        release_barrier \
> +        "0: lr" lr_sfx " %0, %2\n" \
> +        "   bne  %0, %z3, 1f\n" \
> +        "   sc" sc_sfx " %1, %z4, %2\n" \
> +        "   bnez %1, 0b\n" \
> +        acquire_barrier \
> +        "1:\n" \
> +        : "=&r" (ret), "=&r" (rc), "+A" (*ptr) \
> +        : "rJ" (old), "rJ" (new) \
> +        : "memory"); \
> + })
> +
> +#define emulate_cmpxchg_1_2(ptr, old, new, ret, sc_sfx, release_barrier, 
> acquire_barrier) \
> +({ \
> +    uint32_t *ptr_32b_aligned = (uint32_t *)ALIGN_DOWN((unsigned long)ptr, 
> 4); \
> +    uint8_t mask_l = ((unsigned long)(ptr) & (0x8 - sizeof(*ptr))) * 
> BITS_PER_BYTE; \
> +    uint8_t mask_size = sizeof(*ptr) * BITS_PER_BYTE; \
> +    uint8_t mask_h = mask_l + mask_size - 1; \
> +    unsigned long mask = GENMASK(mask_h, mask_l); \
> +    unsigned long old_ = (unsigned long)(old) << mask_l; \
> +    unsigned long new_ = (unsigned long)(new) << mask_l; \
> +    unsigned long ret_; \
> +    unsigned long rc; \
> +    \
> +    __asm__ __volatile__ ( \
> +        release_barrier \
> +        "0: lr.d %0, %2\n" \
> +        "   and  %1, %0, %z5\n" \
> +        "   bne  %1, %z3, 1f\n" \
> +        "   and  %1, %0, %z6\n" \

Isn't this equivalent to

        "   xor  %1, %1, %0\n" \

thus eliminating one (likely register) input?

Furthermore with the above and ...

> +        "   or   %1, %1, %z4\n" \
> +        "   sc.d" sc_sfx " %1, %1, %2\n" \
> +        "   bnez %1, 0b\n" \

... this re-written to

        "   xor  %0, %1, %0\n" \
        "   or   %0, %0, %z4\n" \
        "   sc.d" sc_sfx " %0, %0, %2\n" \
        "   bnez %0, 0b\n" \

you'd then no longer clobber the ret_ & mask you've already calculated
in %1, so ...

> +        acquire_barrier \
> +        "1:\n" \
> +        : "=&r" (ret_), "=&r" (rc), "+A" (*ptr_32b_aligned) \
> +        : "rJ" (old_), "rJ" (new_), \
> +          "rJ" (mask), "rJ" (~mask) \
> +        : "memory"); \
> +    \
> +    ret = (__typeof__(*(ptr)))((ret_ & mask) >> mask_l); \

... you could use rc here. (Of course variable naming or use then may
want changing, assuming I understand why "rc" is named the way it is.)

> +})
> +
> +/*
> + * Atomic compare and exchange.  Compare OLD with MEM, if identical,
> + * store NEW in MEM.  Return the initial value in MEM.  Success is
> + * indicated by comparing RETURN with OLD.
> + */
> +#define __cmpxchg_generic(ptr, old, new, size, sc_sfx, release_barrier, 
> acquire_barrier) \
> +({ \
> +    __typeof__(ptr) ptr__ = (ptr); \
> +    __typeof__(*(ptr)) old__ = (__typeof__(*(ptr)))(old); \
> +    __typeof__(*(ptr)) new__ = (__typeof__(*(ptr)))(new); \
> +    __typeof__(*(ptr)) ret__; \
> +    switch (size) \
> +    { \
> +    case 1: \
> +    case 2: \
> +        emulate_cmpxchg_1_2(ptr, old, new, ret__,\
> +                            sc_sfx, release_barrier, acquire_barrier); \
> +        break; \
> +    case 4: \
> +        __generic_cmpxchg(ptr__, old__, new__, ret__, \
> +                          ".w", ".w"sc_sfx, release_barrier, 
> acquire_barrier); \
> +        break; \
> +    case 8: \
> +        __generic_cmpxchg(ptr__, old__, new__, ret__, \
> +                          ".d", ".d"sc_sfx, release_barrier, 
> acquire_barrier); \
> +        break; \
> +    default: \
> +        STATIC_ASSERT_UNREACHABLE(); \
> +    } \
> +    ret__; \
> +})
> +
> +#define cmpxchg_relaxed(ptr, o, n) \
> +({ \
> +    __typeof__(*(ptr)) o_ = (o); \
> +    __typeof__(*(ptr)) n_ = (n); \
> +    (__typeof__(*(ptr)))__cmpxchg_generic(ptr, \
> +                    o_, n_, sizeof(*(ptr)), "", "", ""); \
> +})
> +
> +#define cmpxchg_acquire(ptr, o, n) \
> +({ \
> +    __typeof__(*(ptr)) o_ = (o); \
> +    __typeof__(*(ptr)) n_ = (n); \
> +    (__typeof__(*(ptr)))__cmpxchg_generic(ptr, o_, n_, sizeof(*(ptr)), \
> +                                          "", "", RISCV_ACQUIRE_BARRIER); \
> +})
> +
> +#define cmpxchg_release(ptr, o, n) \
> +({ \
> +    __typeof__(*(ptr)) o_ = (o); \
> +    __typeof__(*(ptr)) n_ = (n); \
> +    (__typeof__(*(ptr)))__cmpxchg_release(ptr, o_, n_, sizeof(*(ptr)), \
> +                                          "", RISCV_RELEASE_BARRIER, ""); \
> +})
> +
> +#define cmpxchg(ptr, o, n) \
> +({ \
> +    __typeof__(*(ptr)) ret__; \
> +    ret__ = (__typeof__(*(ptr))) \
> +            __cmpxchg_generic(ptr, (unsigned long)(o), (unsigned long)(n), \
> +                              sizeof(*(ptr)), ".rl", "", " fence rw, rw\n"); 
> \

No RISCV_..._BARRIER for use here and ...

> +    ret__; \
> +})
> +
> +#define __cmpxchg(ptr, o, n, s) \
> +({ \
> +    __typeof__(*(ptr)) ret__; \
> +    ret__ = (__typeof__(*(ptr))) \
> +            __cmpxchg_generic(ptr, (unsigned long)(o), (unsigned long)(n), \
> +                              s, ".rl", "", " fence rw, rw\n"); \

... here? And anyway, wouldn't it make sense to have

#define cmpxchg(ptr, o, n) __cmpxchg(ptr, o, n, sizeof(*(ptr)))

to limit redundancy?

Plus wouldn't

#define __cmpxchg(ptr, o, n, s) \
    ((__typeof__(*(ptr))) \
     __cmpxchg_generic(ptr, (unsigned long)(o), (unsigned long)(n), \
                       s, ".rl", "", " fence rw, rw\n"))

be shorter and thus easier to follow as well? As I notice only now,
this would apparently apply further up as well.

Jan

Reply via email to