Richard Biener <richard.guent...@gmail.com> writes:
> On Thu, Sep 24, 2020 at 9:38 PM Segher Boessenkool
> <seg...@kernel.crashing.org> wrote:
>>
>> Hi!
>>
>> On Thu, Sep 24, 2020 at 04:55:21PM +0200, Richard Biener wrote:
>> > Btw, on x86_64 the following produces something reasonable:
>> >
>> > #define N 32
>> > typedef int T;
>> > typedef T V __attribute__((vector_size(N)));
>> > V setg (V v, int idx, T val)
>> > {
>> >   V valv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
>> >   V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == valv);
>> >   v = (v & ~mask) | (valv & mask);
>> >   return v;
>> > }
>> >
>> >         vmovd   %edi, %xmm1
>> >         vpbroadcastd    %xmm1, %ymm1
>> >         vpcmpeqd        .LC0(%rip), %ymm1, %ymm2
>> >         vpblendvb       %ymm2, %ymm1, %ymm0, %ymm0
>> >         ret
>> >
>> > I'm quite sure you could do something similar on power?
>>
>> This only allows inserting aligned elements.  Which is probably fine
>> of course (we don't allow elements that straddle vector boundaries
>> either, anyway).
>>
>> And yes, we can do that :-)
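
For concreteness, the 16-byte version of the idiom should map to a
splat, a compare and a select on Power.  A sketch using AltiVec/VSX
intrinsics (mine, not from this thread, so illustrative only):

  #include <altivec.h>

  vector int
  setg_vmx (vector int v, int idx, int val)
  {
    vector int idxv = vec_splats (idx);               /* splat idx */
    vector int valv = vec_splats (val);               /* splat val */
    vector bool int mask
      = vec_cmpeq ((vector int){0, 1, 2, 3}, idxv);   /* lane == idx? */
    return vec_sel (v, valv, mask);                   /* blend by mask */
  }

vec_sel computes (v & ~mask) | (valv & mask), i.e. exactly the
open-coded blend in the corrected example below.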
>>
>> That should be
>>   #define N 32
>>   typedef int T;
>>   typedef T V __attribute__((vector_size(N)));
>>   V setg (V v, int idx, T val)
>>   {
>>     V valv = (V){val, val, val, val, val, val, val, val};
>>     V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
>>     V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
>>     v = (v & ~mask) | (valv & mask);
>>     return v;
>>   }
>
> Whoops yeah, simplified it a bit too much ;)
>
>> after which I get (-march=znver2)
>>
>> setg:
>>         vmovd   %edi, %xmm1
>>         vmovd   %esi, %xmm2
>>         vpbroadcastd    %xmm1, %ymm1
>>         vpbroadcastd    %xmm2, %ymm2
>>         vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
>>         vpandn  %ymm0, %ymm1, %ymm0
>>         vpand   %ymm2, %ymm1, %ymm1
>>         vpor    %ymm0, %ymm1, %ymm0
>>         ret
>
> I get with -march=znver2 -O2
>
>         vmovd   %edi, %xmm1
>         vmovd   %esi, %xmm2
>         vpbroadcastd    %xmm1, %ymm1
>         vpbroadcastd    %xmm2, %ymm2
>         vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
>         vpblendvb       %ymm1, %ymm2, %ymm0, %ymm0
>
> and with -mavx512vl
>
>         vpbroadcastd    %edi, %ymm1
>         vpcmpd  $0, .LC0(%rip), %ymm1, %k1
>         vpbroadcastd    %esi, %ymm0{%k1}
>
> broadcast-with-mask - heh, it would be interesting if we managed
> to combine v[idx1] = val; v[idx2] = val; ;)
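
A sketch of what that combined case might look like in source form,
using the same vector extensions as the examples above (setg2 and
its two-index interface are hypothetical, and nothing is known to
fold this into a single masked broadcast today):

  V setg2 (V v, int idx1, int idx2, T val)
  {
    V i1 = (V){idx1, idx1, idx1, idx1, idx1, idx1, idx1, idx1};
    V i2 = (V){idx2, idx2, idx2, idx2, idx2, idx2, idx2, idx2};
    V cur = (V){0, 1, 2, 3, 4, 5, 6, 7};
    V mask = (cur == i1) | (cur == i2);    /* both target lanes */
    V valv = (V){val, val, val, val, val, val, val, val};
    return (v & ~mask) | (valv & mask);    /* one blend for both */
  }

Each lane compare yields all-ones or all-zeros, so ORing the two
masks selects both target lanes; with AVX512VL that would be a
candidate for a single merging vpbroadcastd.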
>
> Now, with SSE4.2 the 16-byte case compiles to
>
> setg:
> .LFB0:
>         .cfi_startproc
>         movd    %edi, %xmm3
>         movdqa  %xmm0, %xmm1
>         movd    %esi, %xmm4
>         pshufd  $0, %xmm3, %xmm0
>         pcmpeqd .LC0(%rip), %xmm0
>         movdqa  %xmm0, %xmm2
>         pandn   %xmm1, %xmm2
>         pshufd  $0, %xmm4, %xmm1
>         pand    %xmm1, %xmm0
>         por     %xmm2, %xmm0
>         ret
>
> since the only variable-mask blends there (pblendvb & co.) need the
> mask in xmm0, IIRC.
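
For reference, the 16-byte input is presumably just the 4-lane
variant of the same idiom (a sketch, assuming N == 16 and int
lanes):

  #define N 16
  typedef int T;
  typedef T V __attribute__((vector_size(N)));
  V setg (V v, int idx, T val)
  {
    V valv = (V){val, val, val, val};
    V idxv = (V){idx, idx, idx, idx};
    V mask = ((V){0, 1, 2, 3} == idxv);
    return (v & ~mask) | (valv & mask);
  }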
>
> with aarch64 and SVE it doesn't handle the 32-byte case at all, [...]

FWIW, the SVE version with -msve-vector-bits=256 is:

        ptrue   p0.b, vl32
        mov     z1.s, w1
        index   z2.s, #0, #1
        ld1w    z0.s, p0/z, [x0]
        cmpeq   p1.s, p0/z, z1.s, z2.s
        mov     z0.s, p1/m, w2
        st1w    z0.s, p0, [x8]

where the ptrue, ld1w and st1w are just because generic 256-bit
vectors are passed in memory; the real operation is:

        mov     z1.s, w1
        index   z2.s, #0, #1
        cmpeq   p1.s, p0/z, z1.s, z2.s
        mov     z0.s, p1/m, w2
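
For comparison, a rough ACLE intrinsics equivalent of that
four-instruction sequence (an illustrative sketch only, not what
GCC generates from the generic vector source):

  #include <arm_sve.h>

  svint32_t
  setg_sve (svint32_t v, int32_t idx, int32_t val)
  {
    svbool_t pg = svptrue_b32 ();                  /* all lanes */
    svint32_t lane = svindex_s32 (0, 1);           /* 0, 1, 2, ... */
    svbool_t hit = svcmpeq_n_s32 (pg, lane, idx);  /* lane == idx? */
    return svdup_n_s32_m (v, hit, val);            /* merging splat */
  }

The merging form of svdup writes val only to the lanes selected by
the predicate and leaves v intact elsewhere, which is exactly the
final "mov z0.s, p1/m, w2".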

Thanks,
Richard
