On Tue, Jul 13, 2021 at 4:59 PM Kewen.Lin <li...@linux.ibm.com> wrote:
>
> on 2021/7/13 下午8:42, Richard Biener wrote:
> > On Tue, Jul 13, 2021 at 12:25 PM Kewen.Lin <li...@linux.ibm.com> wrote:
> >>
> >> Hi Richi,
> >>
> >> Thanks for the comments!
> >>
> >> on 2021/7/13 下午5:35, Richard Biener wrote:
> >>> On Tue, Jul 13, 2021 at 10:53 AM Kewen.Lin <li...@linux.ibm.com> wrote:
> >>>>
> >>>> Hi,
> >>>>
> >>>> When I added the support for Power10 newly introduced multiply
> >>>> highpart instructions, I noticed that currently the vectorizer
> >>>> doesn't try to vectorize the multiply highpart pattern, I hope
> >>>> this isn't intentional?
> >>>>
> >>>> This patch is to extend the existing pattern mulhs handlings
> >>>> to cover multiply highpart.  Another alternative seems to
> >>>> recog mul_highpart operation in a general place applied for
> >>>> scalar code when the target supports the optab for the scalar
> >>>> operation, it's based on the assumption that one target which
> >>>> supports vector version of multiply highpart should have the
> >>>> scalar version.  I noticed that the function can_mult_highpart_p
> >>>> can check/handle mult_highpart well even without mul_highpart
> >>>> optab support, I think to recog this pattern in vectorizer
> >>>> is better.  Is it on the right track?
> >>>
> >>> I think it's on the right track, using IFN_LAST is a bit awkward
> >>> in case yet another case pops up so maybe you can use
> >>> a code_helper instance instead which unifies tree_code,
> >>> builtin_code and internal_fn?
> >>>
> >>
> >> If there is one new requirement which doesn't have/introduce IFN
> >> stuffs but have one existing tree_code, can we add one more field
> >> with type tree_code, then for the IFN_LAST path we can check the
> >> different requirements under the guard with that tree_code variable?
> >>
> >>> I also notice that can_mult_highpart_p will return true if
> >>> only vec_widen_[us]mult_{even,odd,hi,lo} are available,
> >>> but then the result might be less optimal (or even not
> >>> handled later)?
> >>>
> >>
> >> I think it will be handled always?  The expander calls
> >>
> >> rtx
> >> expand_mult_highpart (machine_mode mode, rtx op0, rtx op1,
> >>                       rtx target, bool uns_p)
> >>
> >> which will further check with can_mult_highpart_p.
> >>
> >> For the below case,
> >>
> >> #define SHT_CNT 16
> >>
> >> __attribute__ ((noipa)) void
> >> test ()
> >> {
> >>   for (int i = 0; i < N; i++)
> >>     sh_c[i] = ((SI) sh_a[i] * (SI) sh_b[i]) >> 16;
> >> }
> >>
> >> Without this patch, it uses widen_mult like below:
> >>
> >>   vect__1.5_19 = MEM <vector(8) short int> [(short int *)&sh_a + 
> >> ivtmp.18_24 * 1];
> >>   vect__3.8_14 = MEM <vector(8) short int> [(short int *)&sh_b + 
> >> ivtmp.18_24 * 1];
> >>   vect_patt_22.9_13 = WIDEN_MULT_LO_EXPR <vect__3.8_14, vect__1.5_19>;
> >>   vect_patt_22.9_9 = WIDEN_MULT_HI_EXPR <vect__3.8_14, vect__1.5_19>;
> >>   vect__6.10_25 = vect_patt_22.9_13 >> 16;
> >>   vect__6.10_26 = vect_patt_22.9_9 >> 16;
> >>   vect__7.11_27 = VEC_PACK_TRUNC_EXPR <vect__6.10_25, vect__6.10_26>;
> >>   MEM <vector(8) short int> [(short int *)&sh_c + ivtmp.18_24 * 1] = 
> >> vect__7.11_27;
> >>
> >> .L2:
> >>         lxvx 33,7,9
> >>         lxvx 32,8,9
> >>         vmulosh 13,0,1    // widen mult
> >>         vmulesh 0,0,1
> >>         xxmrglw 33,32,45  // merge
> >>         xxmrghw 32,32,45
> >>         vsraw 1,1,12      // shift
> >>         vsraw 0,0,12
> >>         vpkuwum 0,0,1     // pack
> >>         stxvx 32,10,9
> >>         addi 9,9,16
> >>         bdnz .L2
> >>
> >>
> >> With this patch, it ends up with:
> >>
> >>   vect__1.5_14 = MEM <vector(8) short int> [(short int *)&sh_a + 
> >> ivtmp.17_24 * 1];
> >>   vect__3.8_8 = MEM <vector(8) short int> [(short int *)&sh_b + 
> >> ivtmp.17_24 * 1];
> >>   vect_patt_21.9_25 = vect__3.8_8 h* vect__1.5_14;
> >>   MEM <vector(8) short int> [(short int *)&sh_c + ivtmp.17_24 * 1] = 
> >> vect_patt_21.9_25;
> >
> > Yes, so I'm curious what it ends up with/without the patch on x86_64 which
> > can do vec_widen_[us]mult_{even,odd} but not [us]mul_highpart.
> >
>
> For test case:
>
> ```
> #define N 32
> typedef signed int bigType;
> typedef signed short smallType;
> #define SH_CNT 16
>
> extern smallType small_a[N], small_b[N], small_c[N];
>
> __attribute__((noipa)) void test_si(int n) {
>   for (int i = 0; i < n; i++)
>     small_c[i] = ((bigType)small_a[i] * (bigType)small_b[i]) >> SH_CNT;
> }
>
> ```
>
> on x86_64, with option set: -mfpmath=sse -msse2 -O2 -ftree-vectorize
>
> 1) without this patch, the pattern isn't recognized, the IR looks like:
>
>   <bb 4> [local count: 94607391]:
>   bnd.5_34 = niters.4_25 >> 3;
>   _13 = (sizetype) bnd.5_34;
>   _29 = _13 * 16;
>
>   <bb 5> [local count: 378429566]:
>   # ivtmp.34_4 = PHI <ivtmp.34_5(5), 0(4)>
>   vect__1.10_40 = MEM <vector(8) short int> [(short int *)&small_a + 
> ivtmp.34_4 * 1];
>   vect__3.13_43 = MEM <vector(8) short int> [(short int *)&small_b + 
> ivtmp.34_4 * 1];
>   vect_patt_18.14_44 = WIDEN_MULT_LO_EXPR <vect__1.10_40, vect__3.13_43>;
>   vect_patt_18.14_45 = WIDEN_MULT_HI_EXPR <vect__1.10_40, vect__3.13_43>;
>   vect__6.15_46 = vect_patt_18.14_44 >> 16;
>   vect__6.15_47 = vect_patt_18.14_45 >> 16;
>   vect__7.16_48 = VEC_PACK_TRUNC_EXPR <vect__6.15_46, vect__6.15_47>;
>   MEM <vector(8) short int> [(short int *)&small_c + ivtmp.34_4 * 1] = 
> vect__7.16_48;
>   ivtmp.34_5 = ivtmp.34_4 + 16;
>   if (ivtmp.34_5 != _29)
>     goto <bb 5>; [75.00%]
>   else
>     goto <bb 6>; [25.00%]
>
> ...
>
> *asm*:
>
> .L4:
>         movdqu  small_b(%rax), %xmm3
>         movdqu  small_a(%rax), %xmm1
>         addq    $16, %rax
>         movdqu  small_a-16(%rax), %xmm2
>         pmullw  %xmm3, %xmm1
>         pmulhw  %xmm3, %xmm2
>         movdqa  %xmm1, %xmm0
>         punpcklwd       %xmm2, %xmm0
>         punpckhwd       %xmm2, %xmm1
>         psrad   $16, %xmm1
>         psrad   $16, %xmm0
>         movdqa  %xmm0, %xmm2
>         punpcklwd       %xmm1, %xmm0
>         punpckhwd       %xmm1, %xmm2
>         movdqa  %xmm0, %xmm1
>         punpckhwd       %xmm2, %xmm1
>         punpcklwd       %xmm2, %xmm0
>         punpcklwd       %xmm1, %xmm0
>         movups  %xmm0, small_c-16(%rax)
>         cmpq    %rdx, %rax
>         jne     .L4
>         movl    %edi, %eax
>         andl    $-8, %eax
>         testb   $7, %dil
>         je      .L14
>
> *insn dist in loop*
>
>       1 addq
>       3 movdqa
>       3 movdqu
>       1 movups
>       1 pmulhw
>       1 pmullw
>       2 psrad
>       3 punpckhwd
>       4 punpcklwd
>
>
> 2) with this patch but making the mul_highpart optab query return false, the IR 
> looks
> like:
>
>   <bb 4> [local count: 94607391]:
>   bnd.5_37 = niters.4_22 >> 3;
>   _13 = (sizetype) bnd.5_37;
>   _32 = _13 * 16;
>
>   <bb 5> [local count: 378429566]:
>   # ivtmp.33_4 = PHI <ivtmp.33_5(5), 0(4)>
>   vect__1.10_43 = MEM <vector(8) short int> [(short int *)&small_a + 
> ivtmp.33_4 * 1];
>   vect__3.13_46 = MEM <vector(8) short int> [(short int *)&small_b + 
> ivtmp.33_4 * 1];
>   vect_patt_26.14_47 = vect__1.10_43 h* vect__3.13_46;
>   MEM <vector(8) short int> [(short int *)&small_c + ivtmp.33_4 * 1] = 
> vect_patt_26.14_47;
>   ivtmp.33_5 = ivtmp.33_4 + 16;
>   if (ivtmp.33_5 != _32)
>     goto <bb 5>; [75.00%]
>   else
>     goto <bb 6>; [25.00%]
>
> *asm*:
>
> .L4:
>         movdqu  small_b(%rax), %xmm3
>         movdqu  small_a(%rax), %xmm1
>         addq    $16, %rax
>         movdqu  small_a-16(%rax), %xmm2
>         pmullw  %xmm3, %xmm1
>         pmulhw  %xmm3, %xmm2
>         movdqa  %xmm1, %xmm0
>         punpcklwd       %xmm2, %xmm0
>         punpckhwd       %xmm2, %xmm1
>         movdqa  %xmm0, %xmm2
>         punpcklwd       %xmm1, %xmm0
>         punpckhwd       %xmm1, %xmm2
>         movdqa  %xmm0, %xmm1
>         punpckhwd       %xmm2, %xmm1
>         punpcklwd       %xmm2, %xmm0
>         punpckhwd       %xmm1, %xmm0
>         movups  %xmm0, small_c-16(%rax)
>         cmpq    %rdx, %rax
>         jne     .L4
>         movl    %edi, %eax
>         andl    $-8, %eax
>         testb   $7, %dil
>         je      .L14
>
> *insn dist*:
>
>       1 addq
>       3 movdqa
>       3 movdqu
>       1 movups
>       1 pmulhw
>       1 pmullw
>       4 punpckhwd
>       3 punpcklwd
>
> I know little about i386 asm, but this seems better judging from the insn
> distribution: the version without this patch uses two more psrad(s).

So the advantage of 1) would be more appropriate costing in the vectorizer
(x86 does not have a native vector highpart multiply).

> 3) FWIW with this patch as well as existing optab supports, the IR looks like:
>
>   <bb 4> [local count: 94607391]:
>   bnd.5_40 = niters.4_19 >> 3;
>   _75 = (sizetype) bnd.5_40;
>   _91 = _75 * 16;
>
>   <bb 5> [local count: 378429566]:
>   # ivtmp.48_53 = PHI <ivtmp.48_45(5), 0(4)>
>   vect__1.10_48 = MEM <vector(8) short int> [(short int *)&small_a + 
> ivtmp.48_53 * 1];
>   vect__3.13_51 = MEM <vector(8) short int> [(short int *)&small_b + 
> ivtmp.48_53 * 1];
>   vect_patt_26.14_52 = vect__1.10_48 h* vect__3.13_51;
>   MEM <vector(8) short int> [(short int *)&small_c + ivtmp.48_53 * 1] = 
> vect_patt_26.14_52;
>   ivtmp.48_45 = ivtmp.48_53 + 16;
>   if (ivtmp.48_45 != _91)
>     goto <bb 5>; [75.00%]
>   else
>     goto <bb 6>; [25.00%]
>
>   <bb 6> [local count: 94607391]:
>   niters_vector_mult_vf.6_41 = niters.4_19 & 4294967288;
>   tmp.7_42 = (int) niters_vector_mult_vf.6_41;
>   _61 = niters.4_19 & 7;
>   if (_61 == 0)
>     goto <bb 12>; [12.50%]
>   else
>     goto <bb 7>; [87.50%]
>
>   <bb 7> [local count: 93293400]:
>   # i_38 = PHI <tmp.7_42(6), 0(3)>
>   # _44 = PHI <niters_vector_mult_vf.6_41(6), 0(3)>
>   niters.18_60 = niters.4_19 - _44;
>   _76 = niters.18_60 + 4294967295;
>   if (_76 <= 2)
>     goto <bb 9>; [10.00%]
>   else
>     goto <bb 8>; [90.00%]
>
>   <bb 8> [local count: 167928121]:
>   _85 = (sizetype) _44;
>   _86 = _85 * 2;
>   vectp_small_a.23_84 = &small_a + _86;
>   vectp_small_b.26_90 = &small_b + _86;
>   vectp_small_c.31_98 = &small_c + _86;
>   vect__9.24_89 = MEM <vector(4) short int> [(short int 
> *)vectp_small_a.23_84];
>   vect__28.27_95 = MEM <vector(4) short int> [(short int 
> *)vectp_small_b.26_90];
>   vect_patt_23.28_96 = vect__9.24_89 h* vect__28.27_95;
>   MEM <vector(4) short int> [(short int *)vectp_small_c.31_98] = 
> vect_patt_23.28_96;
>   niters_vector_mult_vf.20_80 = niters.18_60 & 4294967292;
>   _82 = (int) niters_vector_mult_vf.20_80;
>   tmp.21_81 = i_38 + _82;
>   _74 = niters.18_60 & 3;
>   if (_74 == 0)
>     goto <bb 11>; [25.00%]
>   else
>     goto <bb 9>; [75.00%]
>
> ...
>
> *asm*:
>
> .L4:
>         movdqu  small_a(%rax), %xmm0
>         movdqu  small_b(%rax), %xmm2
>         addq    $16, %rax
>         pmulhw  %xmm2, %xmm0
>         movups  %xmm0, small_c-16(%rax)
>         cmpq    %rdx, %rax
>         jne     .L4
>         movl    %ecx, %edx
>         andl    $-8, %edx
>         movl    %edx, %eax
>         testb   $7, %cl
>         je      .L19
> .L3:
>         movl    %ecx, %esi
>         subl    %edx, %esi
>         leal    -1(%rsi), %edi
>         cmpl    $2, %edi
>         jbe     .L6
>         movq    small_a(%rdx,%rdx), %xmm0
>         movq    small_b(%rdx,%rdx), %xmm1
>         pmulhw  %xmm1, %xmm0
>         movq    %xmm0, small_c(%rdx,%rdx)
>         movl    %esi, %edx
>         andl    $-4, %edx
>         addl    %edx, %eax
>         andl    $3, %esi
>         je      .L1

Oh, so indeed x86 has a vector highpart multiply, not grep-friendly
macroized as <s>mul<mode>3_highpart ;)

> I guess the proposed IFN would be directly mapped for [us]mul_highpart?

Yes.

> or do you expect it will also cover what we support in can_mult_highpart_p?

See above - since we manage to use widen_mult we should expose this
detail for the purpose of costing.  So the pattern would directly ask for
an optab IFN mapping to [us]mul_highpart.

Richard.

> BR,
> Kewen

Reply via email to