On Tue, Jul 13, 2021 at 4:59 PM Kewen.Lin <li...@linux.ibm.com> wrote: > > on 2021/7/13 下午8:42, Richard Biener wrote: > > On Tue, Jul 13, 2021 at 12:25 PM Kewen.Lin <li...@linux.ibm.com> wrote: > >> > >> Hi Richi, > >> > >> Thanks for the comments! > >> > >> on 2021/7/13 下午5:35, Richard Biener wrote: > >>> On Tue, Jul 13, 2021 at 10:53 AM Kewen.Lin <li...@linux.ibm.com> wrote: > >>>> > >>>> Hi, > >>>> > >>>> When I added the support for Power10 newly introduced multiply > >>>> highpart instructions, I noticed that currently vectorizer > >>>> doesn't try to vectorize multiply highpart pattern, I hope > >>>> this isn't intentional? > >>>> > >>>> This patch is to extend the existing pattern mulhs handlings > >>>> to cover multiply highpart. Another alternative seems to > >>>> recog mul_highpart operation in a general place applied for > >>>> scalar code when the target supports the optab for the scalar > >>>> operation, it's based on the assumption that one target which > >>>> supports vector version of multiply highpart should have the > >>>> scalar version. I noticed that the function can_mult_highpart_p > >>>> can check/handle mult_highpart well even without mul_highpart > >>>> optab support, I think to recog this pattern in vectorizer > >>>> is better. Is it on the right track? > >>> > >>> I think it's on the right track, using IFN_LAST is a bit awkward > >>> in case yet another case pops up so maybe you can use > >>> a code_helper instance instead which unifies tree_code, > >>> builtin_code and internal_fn? > >>> > >> > >> If there is one new requirement which doesn't have/introduce IFN > >> stuffs but has one existing tree_code, can we add one more field > >> with type tree_code, then for the IFN_LAST path we can check the > >> different requirements under the guard with that tree_code variable? 
> >> > >>> I also notice that can_mult_highpart_p will return true if > >>> only vec_widen_[us]mult_{even,odd,hi,lo} are available, > >>> but then the result might be less optimal (or even not > >>> handled later)? > >>> > >> > >> I think it will be handled always? The expander calls > >> > >> rtx > >> expand_mult_highpart (machine_mode mode, rtx op0, rtx op1, > >> rtx target, bool uns_p) > >> > >> which will further check with can_mult_highpart_p. > >> > >> For the below case, > >> > >> #define SHT_CNT 16 > >> > >> __attribute__ ((noipa)) void > >> test () > >> { > >> for (int i = 0; i < N; i++) > >> sh_c[i] = ((SI) sh_a[i] * (SI) sh_b[i]) >> 16; > >> } > >> > >> Without this patch, it uses widen_mult like below: > >> > >> vect__1.5_19 = MEM <vector(8) short int> [(short int *)&sh_a + > >> ivtmp.18_24 * 1]; > >> vect__3.8_14 = MEM <vector(8) short int> [(short int *)&sh_b + > >> ivtmp.18_24 * 1]; > >> vect_patt_22.9_13 = WIDEN_MULT_LO_EXPR <vect__3.8_14, vect__1.5_19>; > >> vect_patt_22.9_9 = WIDEN_MULT_HI_EXPR <vect__3.8_14, vect__1.5_19>; > >> vect__6.10_25 = vect_patt_22.9_13 >> 16; > >> vect__6.10_26 = vect_patt_22.9_9 >> 16; > >> vect__7.11_27 = VEC_PACK_TRUNC_EXPR <vect__6.10_25, vect__6.10_26>; > >> MEM <vector(8) short int> [(short int *)&sh_c + ivtmp.18_24 * 1] = > >> vect__7.11_27; > >> > >> .L2: > >> lxvx 33,7,9 > >> lxvx 32,8,9 > >> vmulosh 13,0,1 // widen mult > >> vmulesh 0,0,1 > >> xxmrglw 33,32,45 // merge > >> xxmrghw 32,32,45 > >> vsraw 1,1,12 // shift > >> vsraw 0,0,12 > >> vpkuwum 0,0,1 // pack > >> stxvx 32,10,9 > >> addi 9,9,16 > >> bdnz .L2 > >> > >> > >> With this patch, it ends up with: > >> > >> vect__1.5_14 = MEM <vector(8) short int> [(short int *)&sh_a + > >> ivtmp.17_24 * 1]; > >> vect__3.8_8 = MEM <vector(8) short int> [(short int *)&sh_b + > >> ivtmp.17_24 * 1]; > >> vect_patt_21.9_25 = vect__3.8_8 h* vect__1.5_14; > >> MEM <vector(8) short int> [(short int *)&sh_c + ivtmp.17_24 * 1] = > >> vect_patt_21.9_25; > > > > Yes, so I'm 
curious what it ends up with/without the patch on x86_64 which > > can do vec_widen_[us]mult_{even,odd} but not [us]mul_highpart. > > > > For test case: > > ``` > #define N 32 > typedef signed int bigType; > typedef signed short smallType; > #define SH_CNT 16 > > extern smallType small_a[N], small_b[N], small_c[N]; > > __attribute__((noipa)) void test_si(int n) { > for (int i = 0; i < n; i++) > small_c[i] = ((bigType)small_a[i] * (bigType)small_b[i]) >> SH_CNT; > } > > ``` > > on x86_64, with option set: -mfpmath=sse -msse2 -O2 -ftree-vectorize > > 1) without this patch, the pattern isn't recognized, the IR looks like: > > <bb 4> [local count: 94607391]: > bnd.5_34 = niters.4_25 >> 3; > _13 = (sizetype) bnd.5_34; > _29 = _13 * 16; > > <bb 5> [local count: 378429566]: > # ivtmp.34_4 = PHI <ivtmp.34_5(5), 0(4)> > vect__1.10_40 = MEM <vector(8) short int> [(short int *)&small_a + > ivtmp.34_4 * 1]; > vect__3.13_43 = MEM <vector(8) short int> [(short int *)&small_b + > ivtmp.34_4 * 1]; > vect_patt_18.14_44 = WIDEN_MULT_LO_EXPR <vect__1.10_40, vect__3.13_43>; > vect_patt_18.14_45 = WIDEN_MULT_HI_EXPR <vect__1.10_40, vect__3.13_43>; > vect__6.15_46 = vect_patt_18.14_44 >> 16; > vect__6.15_47 = vect_patt_18.14_45 >> 16; > vect__7.16_48 = VEC_PACK_TRUNC_EXPR <vect__6.15_46, vect__6.15_47>; > MEM <vector(8) short int> [(short int *)&small_c + ivtmp.34_4 * 1] = > vect__7.16_48; > ivtmp.34_5 = ivtmp.34_4 + 16; > if (ivtmp.34_5 != _29) > goto <bb 5>; [75.00%] > else > goto <bb 6>; [25.00%] > > ... 
> > *asm*: > > .L4: > movdqu small_b(%rax), %xmm3 > movdqu small_a(%rax), %xmm1 > addq $16, %rax > movdqu small_a-16(%rax), %xmm2 > pmullw %xmm3, %xmm1 > pmulhw %xmm3, %xmm2 > movdqa %xmm1, %xmm0 > punpcklwd %xmm2, %xmm0 > punpckhwd %xmm2, %xmm1 > psrad $16, %xmm1 > psrad $16, %xmm0 > movdqa %xmm0, %xmm2 > punpcklwd %xmm1, %xmm0 > punpckhwd %xmm1, %xmm2 > movdqa %xmm0, %xmm1 > punpckhwd %xmm2, %xmm1 > punpcklwd %xmm2, %xmm0 > punpcklwd %xmm1, %xmm0 > movups %xmm0, small_c-16(%rax) > cmpq %rdx, %rax > jne .L4 > movl %edi, %eax > andl $-8, %eax > testb $7, %dil > je .L14 > > *insn dist in loop* > > 1 addq > 3 movdqa > 3 movdqu > 1 movups > 1 pmulhw > 1 pmullw > 2 psrad > 3 punpckhwd > 4 punpcklwd > > > 2) with this patch but make the mul_highpart optab query return false, the IR > looks > like: > > <bb 4> [local count: 94607391]: > bnd.5_37 = niters.4_22 >> 3; > _13 = (sizetype) bnd.5_37; > _32 = _13 * 16; > > <bb 5> [local count: 378429566]: > # ivtmp.33_4 = PHI <ivtmp.33_5(5), 0(4)> > vect__1.10_43 = MEM <vector(8) short int> [(short int *)&small_a + > ivtmp.33_4 * 1]; > vect__3.13_46 = MEM <vector(8) short int> [(short int *)&small_b + > ivtmp.33_4 * 1]; > vect_patt_26.14_47 = vect__1.10_43 h* vect__3.13_46; > MEM <vector(8) short int> [(short int *)&small_c + ivtmp.33_4 * 1] = > vect_patt_26.14_47; > ivtmp.33_5 = ivtmp.33_4 + 16; > if (ivtmp.33_5 != _32) > goto <bb 5>; [75.00%] > else > goto <bb 6>; [25.00%] > > *asm*: > > .L4: > movdqu small_b(%rax), %xmm3 > movdqu small_a(%rax), %xmm1 > addq $16, %rax > movdqu small_a-16(%rax), %xmm2 > pmullw %xmm3, %xmm1 > pmulhw %xmm3, %xmm2 > movdqa %xmm1, %xmm0 > punpcklwd %xmm2, %xmm0 > punpckhwd %xmm2, %xmm1 > movdqa %xmm0, %xmm2 > punpcklwd %xmm1, %xmm0 > punpckhwd %xmm1, %xmm2 > movdqa %xmm0, %xmm1 > punpckhwd %xmm2, %xmm1 > punpcklwd %xmm2, %xmm0 > punpckhwd %xmm1, %xmm0 > movups %xmm0, small_c-16(%rax) > cmpq %rdx, %rax > jne .L4 > movl %edi, %eax > andl $-8, %eax > testb $7, %dil > je .L14 > > *insn dist*: > > 1 addq 
> 3 movdqa > 3 movdqu > 1 movups > 1 pmulhw > 1 pmullw > 4 punpckhwd > 3 punpcklwd > > I know little about i386 asm, but this seems better from the insn distribution: > the one without this patch uses two more psrad instructions.
So the advantage of 1) would be more appropriate costing in the vectorizer (x86 does not have a native vector highpart multiply). > 3) FWIW with this patch as well as existing optab supports, the IR looks like: > > <bb 4> [local count: 94607391]: > bnd.5_40 = niters.4_19 >> 3; > _75 = (sizetype) bnd.5_40; > _91 = _75 * 16; > > <bb 5> [local count: 378429566]: > # ivtmp.48_53 = PHI <ivtmp.48_45(5), 0(4)> > vect__1.10_48 = MEM <vector(8) short int> [(short int *)&small_a + > ivtmp.48_53 * 1]; > vect__3.13_51 = MEM <vector(8) short int> [(short int *)&small_b + > ivtmp.48_53 * 1]; > vect_patt_26.14_52 = vect__1.10_48 h* vect__3.13_51; > MEM <vector(8) short int> [(short int *)&small_c + ivtmp.48_53 * 1] = > vect_patt_26.14_52; > ivtmp.48_45 = ivtmp.48_53 + 16; > if (ivtmp.48_45 != _91) > goto <bb 5>; [75.00%] > else > goto <bb 6>; [25.00%] > > <bb 6> [local count: 94607391]: > niters_vector_mult_vf.6_41 = niters.4_19 & 4294967288; > tmp.7_42 = (int) niters_vector_mult_vf.6_41; > _61 = niters.4_19 & 7; > if (_61 == 0) > goto <bb 12>; [12.50%] > else > goto <bb 7>; [87.50%] > > <bb 7> [local count: 93293400]: > # i_38 = PHI <tmp.7_42(6), 0(3)> > # _44 = PHI <niters_vector_mult_vf.6_41(6), 0(3)> > niters.18_60 = niters.4_19 - _44; > _76 = niters.18_60 + 4294967295; > if (_76 <= 2) > goto <bb 9>; [10.00%] > else > goto <bb 8>; [90.00%] > > <bb 8> [local count: 167928121]: > _85 = (sizetype) _44; > _86 = _85 * 2; > vectp_small_a.23_84 = &small_a + _86; > vectp_small_b.26_90 = &small_b + _86; > vectp_small_c.31_98 = &small_c + _86; > vect__9.24_89 = MEM <vector(4) short int> [(short int > *)vectp_small_a.23_84]; > vect__28.27_95 = MEM <vector(4) short int> [(short int > *)vectp_small_b.26_90]; > vect_patt_23.28_96 = vect__9.24_89 h* vect__28.27_95; > MEM <vector(4) short int> [(short int *)vectp_small_c.31_98] = > vect_patt_23.28_96; > niters_vector_mult_vf.20_80 = niters.18_60 & 4294967292; > _82 = (int) niters_vector_mult_vf.20_80; > tmp.21_81 = i_38 + _82; > _74 = 
niters.18_60 & 3; > if (_74 == 0) > goto <bb 11>; [25.00%] > else > goto <bb 9>; [75.00%] > > ... > > *asm*: > > .L4: > movdqu small_a(%rax), %xmm0 > movdqu small_b(%rax), %xmm2 > addq $16, %rax > pmulhw %xmm2, %xmm0 > movups %xmm0, small_c-16(%rax) > cmpq %rdx, %rax > jne .L4 > movl %ecx, %edx > andl $-8, %edx > movl %edx, %eax > testb $7, %cl > je .L19 > .L3: > movl %ecx, %esi > subl %edx, %esi > leal -1(%rsi), %edi > cmpl $2, %edi > jbe .L6 > movq small_a(%rdx,%rdx), %xmm0 > movq small_b(%rdx,%rdx), %xmm1 > pmulhw %xmm1, %xmm0 > movq %xmm0, small_c(%rdx,%rdx) > movl %esi, %edx > andl $-4, %edx > addl %edx, %eax > andl $3, %esi > je .L1 Oh, so indeed x86 has a vector highpart multiply, not grep-friendly macroized as <s>mul<mode>3_highpart ;) > I guess the proposed IFN would be directly mapped for [us]mul_highpart? Yes. > or do you expect it will also cover what we support in can_mult_highpart_p? See above - since we manage to use widen_mult we should expose this detail for the purpose of costing. So the pattern would directly ask for an optab IFN mapping to [us]mul_highpart. Richard. > BR, > Kewen