On 17 February 2018 at 18:22, Richard Henderson
wrote:
> Signed-off-by: Richard Henderson
> ---
> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> index 86cd792cdf..ae433861f8 100644
> --- a/target/arm/sve_helper.c
> +++ b/target/arm/sve_helper.c
> @@ -46,14 +46,14 @@
> *
> * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
> * and bit 0 set if C is set.
> - *
> - * This is an iterative function, called for each Pd and Pg word
> - * moving forward.
> */
>
> /* For no G bits set, NZCV = C. */
> #define PREDTEST_INIT 1
>
> +/* This is an iterative function, called for each Pd and Pg word
> + * moving forward.
> + */
Why move this comment?
> static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
> {
> if (likely(g)) {
> @@ -73,6 +73,28 @@ static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
> return flags;
> }
>
> +/* This is an iterative function, called for each Pd and Pg word
> + * moving backward.
> + */
> +static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
> +{
> +    if (likely(g)) {
> +        /* Compute C from first (i.e. last) !(D & G).
> +           Use bit 2 to signal first G bit seen.  */
> +        if (!(flags & 4)) {
> +            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
> +            flags |= (d & pow2floor(g)) == 0;
> +        }
> +
> +        /* Accumulate Z from each D & G.  */
> +        flags |= ((d & g) != 0) << 1;
> +
> +        /* Compute N from last (i.e. first) D & G.  Replace previous.  */
> +        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
> +    }
> +    return flags;
> +}
> +
> /* The same for a single word predicate. */
> uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
> {
> @@ -2180,3 +2202,168 @@ void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
>          d[i] = (pg[H1(i)] & 1 ? nn : mm);
>      }
>  }
> +
> +/* Two operand comparison controlled by a predicate.
> + * ??? It is very tempting to want to be able to expand this inline
> + * with x86 instructions, e.g.
> + *
> + *    vcmpeqw    zm, zn, %ymm0
> + *    vpmovmskb  %ymm0, %eax
> + *    and        $0x5555, %eax
> + *    and        pg, %eax
> + *
> + * or even aarch64, e.g.
> + *
> + *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
> + *    cmeq       v0.8h, zn, zm
> + *    and        v0.8h, v0.8h, mask
> + *    addv       h0, v0.8h
> + *    and        v0.8b, pg
> + *
> + * However, coming up with an abstraction that allows vector inputs and
> + * a scalar output, and also handles the byte-ordering of sub-uint64_t
> + * scalar outputs, is tricky.
> + */
> +#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
> +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
> +{                                                                            \
> +    intptr_t opr_sz = simd_oprsz(desc);                                      \
> +    uint32_t flags = PREDTEST_INIT;                                          \
> +    intptr_t i = opr_sz;                                                     \
> +    do {                                                                     \
> +        uint64_t out = 0, pg;                                                \
> +        do {                                                                 \
> +            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
> +            TYPE nn = *(TYPE *)(vn + H(i));                                  \
> +            TYPE mm = *(TYPE *)(vm + H(i));                                  \
> +            out |= nn OP mm;                                                 \
> +        } while (i & 63);                                                    \
> +        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
> +        out &= pg;                                                           \
> +        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
> +        flags = iter_predtest_bwd(out, pg, flags);                           \
> +    } while (i > 0);                                                         \
> +    return flags;                                                            \
> +}
Why do we iterate backwards through the vector? As far as I can
see the pseudocode iterates forwards, and I don't think it
makes a difference to the result which way we go.
Otherwise
Reviewed-by: Peter Maydell
thanks
-- PMM