Re: [Qemu-devel] [PATCH v2 36/67] target/arm: Implement SVE Integer Compare - Vectors Group

2018-02-23 Thread Richard Henderson
On 02/23/2018 08:29 AM, Peter Maydell wrote:
> On 17 February 2018 at 18:22, Richard Henderson
>  wrote:
>> Signed-off-by: Richard Henderson 
>> ---
> 
>> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
>> index 86cd792cdf..ae433861f8 100644
>> --- a/target/arm/sve_helper.c
>> +++ b/target/arm/sve_helper.c
>> @@ -46,14 +46,14 @@
>>   *
>>   * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
>>   * and bit 0 set if C is set.
>> - *
>> - * This is an iterative function, called for each Pd and Pg word
>> - * moving forward.
>>   */
>>
>>  /* For no G bits set, NZCV = C.  */
>>  #define PREDTEST_INIT  1
>>
>> +/* This is an iterative function, called for each Pd and Pg word
>> + * moving forward.
>> + */
> 
> Why move this comment?

Meant to fold this into the first comment.  But I moved it so that I can
separately document...

>> +/* This is an iterative function, called for each Pd and Pg word
>> + * moving backward.
>> + */
>> +static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)

... this.

>> +    do {                                                                 \
>> +        uint64_t out = 0, pg;                                            \
>> +        do {                                                             \
>> +            i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
>> +            TYPE nn = *(TYPE *)(vn + H(i));                              \
>> +            TYPE mm = *(TYPE *)(vm + H(i));                              \
>> +            out |= nn OP mm;                                             \
>> +        } while (i & 63);                                                \
>> +        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                        \
>> +        out &= pg;                                                       \
>> +        *(uint64_t *)(vd + (i >> 3)) = out;                              \
>> +        flags = iter_predtest_bwd(out, pg, flags);                       \
>> +    } while (i > 0);                                                     \
>> +    return flags;                                                        \
>> +}
> 
> Why do we iterate backwards through the vector? As far as I can
> see the pseudocode iterates forwards, and I don't think it
> makes a difference to the result which way we go.

You're right, it does not make a difference to the result which way we iterate.
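
Purely for illustration (not part of the patch): wrapping the two iterators
from the patch in trivial driver loops shows the equivalence.  The helper
names below are made up.

static uint32_t predtest_fwd_words(const uint64_t *d, const uint64_t *g,
                                   intptr_t words)
{
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    /* Walk the Pd/Pg words front to back, as the pseudocode does.  */
    for (i = 0; i < words; ++i) {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    }
    return flags;
}

static uint32_t predtest_bwd_words(const uint64_t *d, const uint64_t *g,
                                   intptr_t words)
{
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    /* Walk the same words back to front; the NZCV result is identical.  */
    for (i = words - 1; i >= 0; --i) {
        flags = iter_predtest_bwd(d[i], g[i], flags);
    }
    return flags;
}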

Of the several different ways I've written loops over predicates, this is my
favorite.  It has several points in its favor:

  1) Operate on full uint64_t predicate units instead
 of uint8_t or uint16_t sub-units.  This means

 1a) No big-endian adjustment required,
 1b) Fewer memory loads.

  2) No separate loop tail; it is shared with the main loop body.

  3) Specific to predicate output: the main loop body gets to run
     un-predicated, with the governing predicate applied only at the
     end: out &= pg (sketched below).
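
To make those points concrete, here is roughly what a byte-element CMPEQ
expansion looks like.  This is a hand-written sketch rather than the code
the macro generates: I'm assuming a little-endian host (so H() is the
identity) and an all-ones MASK for the byte case, and the function name
is made up.

uint32_t cmpeq_ppzz_b_sketch(void *vd, void *vn, void *vm,
                             void *vg, uint32_t desc)
{
    intptr_t i = simd_oprsz(desc);   /* byte offset just past the last element */
    uint32_t flags = PREDTEST_INIT;

    do {
        uint64_t out = 0, pg;

        /* 64 byte elements map to one 64-bit predicate word; the
           comparisons never consult the predicate (point 3).  */
        do {
            i -= 1;
            out <<= 1;
            out |= *(uint8_t *)(vn + i) == *(uint8_t *)(vm + i);
        } while (i & 63);

        /* One aligned 64-bit load of the governing predicate per word
           (points 1 and 1b), applied to all 64 results at once.  */
        pg = *(uint64_t *)(vg + (i >> 3));
        out &= pg;
        *(uint64_t *)(vd + (i >> 3)) = out;

        /* Fold this word into the running NZCV flags.  */
        flags = iter_predtest_bwd(out, pg, flags);
    } while (i > 0);   /* no separate tail loop (point 2) */

    return flags;
}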


r~



Re: [Qemu-devel] [PATCH v2 36/67] target/arm: Implement SVE Integer Compare - Vectors Group

2018-02-23 Thread Peter Maydell
On 17 February 2018 at 18:22, Richard Henderson
 wrote:
> Signed-off-by: Richard Henderson 
> ---

> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> index 86cd792cdf..ae433861f8 100644
> --- a/target/arm/sve_helper.c
> +++ b/target/arm/sve_helper.c
> @@ -46,14 +46,14 @@
>   *
>   * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
>   * and bit 0 set if C is set.
> - *
> - * This is an iterative function, called for each Pd and Pg word
> - * moving forward.
>   */
>
>  /* For no G bits set, NZCV = C.  */
>  #define PREDTEST_INIT  1
>
> +/* This is an iterative function, called for each Pd and Pg word
> + * moving forward.
> + */

Why move this comment?

>  static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
>  {
>      if (likely(g)) {
> @@ -73,6 +73,28 @@ static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
>      return flags;
>  }
>
> +/* This is an iterative function, called for each Pd and Pg word
> + * moving backward.
> + */
> +static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
> +{
> +    if (likely(g)) {
> +        /* Compute C from first (i.e last) !(D & G).
> +           Use bit 2 to signal first G bit seen.  */
> +        if (!(flags & 4)) {
> +            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
> +            flags |= (d & pow2floor(g)) == 0;
> +        }
> +
> +        /* Accumulate Z from each D & G.  */
> +        flags |= ((d & g) != 0) << 1;
> +
> +        /* Compute N from last (i.e first) D & G.  Replace previous.  */
> +        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
> +    }
> +    return flags;
> +}
> +
>  /* The same for a single word predicate.  */
>  uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
>  {
> @@ -2180,3 +2202,168 @@ void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
>  d[i] = (pg[H1(i)] & 1 ? nn : mm);
>  }
>  }
> +
> +/* Two operand comparison controlled by a predicate.
> + * ??? It is very tempting to want to be able to expand this inline
> + * with x86 instructions, e.g.
> + *
> + *    vcmpeqw    zm, zn, %ymm0
> + *    vpmovmskb  %ymm0, %eax
> + *    and        $0x, %eax
> + *    and        pg, %eax
> + *
> + * or even aarch64, e.g.
> + *
> + *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
> + *    cmeq       v0.8h, zn, zm
> + *    and        v0.8h, v0.8h, mask
> + *    addv       h0, v0.8h
> + *    and        v0.8b, pg
> + *
> + * However, coming up with an abstraction that allows vector inputs and
> + * a scalar output, and also handles the byte-ordering of sub-uint64_t
> + * scalar outputs, is tricky.
> + */
> +#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                  \
> +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
> +{                                                                             \
> +    intptr_t opr_sz = simd_oprsz(desc);                                       \
> +    uint32_t flags = PREDTEST_INIT;                                           \
> +    intptr_t i = opr_sz;                                                      \
> +    do {                                                                      \
> +        uint64_t out = 0, pg;                                                 \
> +        do {                                                                  \
> +            i -= sizeof(TYPE), out <<= sizeof(TYPE);                          \
> +            TYPE nn = *(TYPE *)(vn + H(i));                                   \
> +            TYPE mm = *(TYPE *)(vm + H(i));                                   \
> +            out |= nn OP mm;                                                  \
> +        } while (i & 63);                                                     \
> +        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                             \
> +        out &= pg;                                                            \
> +        *(uint64_t *)(vd + (i >> 3)) = out;                                   \
> +        flags = iter_predtest_bwd(out, pg, flags);                            \
> +    } while (i > 0);                                                          \
> +    return flags;                                                             \
> +}

Why do we iterate backwards through the vector? As far as I can
see the pseudocode iterates forwards, and I don't think it
makes a difference to the result which way we go.


Otherwise

Reviewed-by: Peter Maydell 

thanks
-- PMM