Forget to attach the patch file.
thanks, Cong On Wed, Oct 30, 2013 at 10:01 AM, Cong Hou <co...@google.com> wrote: > I found my problem: I put DONE outside of if not inside. You are > right. I have updated my patch. > > I appreciate your comment and test on it! > > > thanks, > Cong > > > > diff --git a/gcc/ChangeLog b/gcc/ChangeLog > index 8a38316..84c7ab5 100644 > --- a/gcc/ChangeLog > +++ b/gcc/ChangeLog > @@ -1,3 +1,10 @@ > +2013-10-22 Cong Hou <co...@google.com> > + > + PR target/58762 > + * config/i386/i386-protos.h (ix86_expand_sse2_abs): New function. > + * config/i386/i386.c (ix86_expand_sse2_abs): New function. > + * config/i386/sse.md: Add SSE2 support to abs (8/16/32-bit-int). > + > 2013-10-14 David Malcolm <dmalc...@redhat.com> > > * dumpfile.h (gcc::dump_manager): New class, to hold state > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index 3ab2f3a..ca31224 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -238,6 +238,7 @@ extern void ix86_expand_mul_widen_evenodd (rtx, > rtx, rtx, bool, bool); > extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool); > extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx); > extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); > +extern void ix86_expand_sse2_abs (rtx, rtx); > > /* In i386-c.c */ > extern void ix86_target_macros (void); > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 02cbbbd..71905fc 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -41696,6 +41696,53 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) > gen_rtx_MULT (mode, op1, op2)); > } > > +void > +ix86_expand_sse2_abs (rtx op0, rtx op1) > +{ > + enum machine_mode mode = GET_MODE (op0); > + rtx tmp0, tmp1; > + > + switch (mode) > + { > + /* For 32-bit signed integer X, the best way to calculate the absolute > + value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ > + case V4SImode: > + tmp0 = expand_simple_binop (mode, ASHIFTRT, op1, > + GEN_INT (GET_MODE_BITSIZE > + (GET_MODE_INNER (mode)) - 1), > + NULL, 0, OPTAB_DIRECT); > + if (tmp0) > + tmp1 = expand_simple_binop (mode, XOR, op1, tmp0, > + NULL, 0, OPTAB_DIRECT); > + if (tmp0 && tmp1) > + expand_simple_binop (mode, MINUS, tmp1, tmp0, > + op0, 0, OPTAB_DIRECT); > + break; > + > + /* For 16-bit signed integer X, the best way to calculate the absolute > + value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ > + case V8HImode: > + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0); > + if (tmp0) > + expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0, > + OPTAB_DIRECT); > + break; > + > + /* For 8-bit signed integer X, the best way to calculate the absolute > + value of X is min ((unsigned char) X, (unsigned char) (-X)), > + as SSE2 provides the PMINUB insn. */ > + case V16QImode: > + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0); > + if (tmp0) > + expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0, > + OPTAB_DIRECT); > + break; > + > + default: > + break; > + } > +} > + > /* Expand an insert into a vector register through pinsr insn. > Return true if successful. */ > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index c3f6c94..46e1df4 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -8721,7 +8721,7 @@ > (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p > (insn)")) > (set_attr "mode" "DI")]) > > -(define_insn "abs<mode>2" > +(define_insn "*abs<mode>2" > [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v") > (abs:VI124_AVX2_48_AVX512F > (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))] > @@ -8733,6 +8733,19 @@ > (set_attr "prefix" "maybe_vex") > (set_attr "mode" "<sseinsnmode>")]) > > +(define_expand "abs<mode>2" > + [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand") > + (abs:VI124_AVX2_48_AVX512F > + (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))] > + "TARGET_SSE2" > +{ > + if (!TARGET_SSSE3) > + { > + ix86_expand_sse2_abs (operands[0], operands[1]); > + DONE; > + } > +}) > + > (define_insn "abs<mode>2" > [(set (match_operand:MMXMODEI 0 "register_operand" "=y") > (abs:MMXMODEI > diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog > index 075d071..cf5b942 100644 > --- a/gcc/testsuite/ChangeLog > +++ b/gcc/testsuite/ChangeLog > @@ -1,3 +1,8 @@ > +2013-10-22 Cong Hou <co...@google.com> > + > + PR target/58762 > + * gcc.dg/vect/pr58762.c: New test. > + > 2013-10-14 Tobias Burnus <bur...@net-b.de> > > PR fortran/58658 > diff --git a/gcc/testsuite/gcc.dg/vect/pr58762.c > b/gcc/testsuite/gcc.dg/vect/pr58762.c > new file mode 100644 > index 0000000..6468d0a > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr58762.c > @@ -0,0 +1,28 @@ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize" } */ > + > +void test1 (char* a, char* b) > +{ > + int i; > + for (i = 0; i < 10000; ++i) > + a[i] = abs (b[i]); > +} > + > +void test2 (short* a, short* b) > +{ > + int i; > + for (i = 0; i < 10000; ++i) > + a[i] = abs (b[i]); > +} > + > +void test3 (int* a, int* b) > +{ > + int i; > + for (i = 0; i < 10000; ++i) > + a[i] = abs (b[i]); > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" > + { target i?86-*-* x86_64-*-* } } } */ > +/* { dg-final { cleanup-tree-dump "vect" } } */ > > > > > > On Wed, Oct 30, 2013 at 2:31 AM, Uros Bizjak <ubiz...@gmail.com> wrote: >> On Tue, Oct 29, 2013 at 6:18 PM, Cong Hou <co...@google.com> wrote: >> >>>>> For the define_expand I added as below, the else body is there to >>>>> avoid fall-through transformations to ABS operation in optabs.c. >>>>> Otherwise ABS will be converted to other operations even that we have >>>>> corresponding instructions from SSSE3. >>>> >>>> No, it wont be. >>>> >>>> Fallthrough will generate the pattern that will be matched by the insn >>>> pattern above, just like you are doing by hand below. >>> >>> >>> I think the case is special for abs(). In optabs.c, there is a >>> function expand_abs() in which the function expand_abs_nojump() is >>> called. This function first tries the expand function defined for the >>> target and if it fails it will try max(v, -v) then shift-xor-sub >>> method. If I don't generate any instruction for SSSE3, the >>> fall-through will be max(v, -v). I have tested it on my machine. >> >> I have tested the usual approach in i386.md, shown exactly by the >> patch below (and using your other changes to i386.c): >> >> -- cut here -- >> Index: sse.md >> =================================================================== >> --- sse.md (revision 204149) >> +++ sse.md (working copy) >> @@ -10270,7 +10270,7 @@ >> (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p >> (insn)")) >> (set_attr "mode" "DI")]) >> >> -(define_insn "abs<mode>2" >> +(define_insn "*abs<mode>2" >> [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v") >> (abs:VI124_AVX2_48_AVX512F >> (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" >> "vm")))] >> @@ -10282,6 +10282,19 @@ >> (set_attr "prefix" "maybe_vex") >> (set_attr "mode" "<sseinsnmode>")]) >> >> +(define_expand "abs<mode>2" >> + [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand") >> + (abs:VI124_AVX2_48_AVX512F >> + (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))] >> + "TARGET_SSE2" >> +{ >> + if (!TARGET_SSSE3) >> + { >> + ix86_expand_sse2_abs (operands[0], operands[1]); >> + DONE; >> + } >> +}) >> + >> (define_insn "abs<mode>2" >> [(set (match_operand:MMXMODEI 0 "register_operand" "=y") >> (abs:MMXMODEI >> -- cut here -- >> >> using following testcase: >> >> --cut here-- >> #define N 32 >> >> int ca[N]; >> int cb[N]; >> >> void test1 (void) >> { >> int i; >> for (i = 0; i < N; ++i) >> cb[i] = abs (ca[i]); >> } >> -- cut here -- >> >> Compiling on x86_64-pc-linux-gnu target (inherently -msse2): >> >> ~/gcc-build-fast/gcc/cc1 -O2 -ftree-vectorize -dp t.c >> >> .L2: >> movdqa ca(%rax), %xmm0 # 25 *movv4si_internal/2 [length = 8] >> addq $16, %rax # 30 *adddi_1/1 [length = 4] >> movdqa ca-16(%rax), %xmm1 # 46 *movv4si_internal/2 >> [length = 8] >> psrad $31, %xmm0 # 26 ashrv4si3/1 [length = 5] >> pxor %xmm0, %xmm1 # 47 *xorv4si3/1 [length = 4] >> psubd %xmm0, %xmm1 # 28 *subv4si3/1 [length = 4] >> movaps %xmm1, cb-16(%rax) # 29 *movv4si_internal/3 >> [length = 7] >> cmpq $128, %rax # 32 *cmpdi_1/1 [length = 6] >> jne .L2 # 33 *jcc_1 [length = 2] >> >> ~/gcc-build-fast/gcc/cc1 -O2 -ftree-vectorize -mssse3 -dp t.c >> >> .L2: >> pabsd ca(%rax), %xmm0 # 25 *absv4si2 [length = 9] >> addq $16, %rax # 27 *adddi_1/1 [length = 4] >> movaps %xmm0, cb-16(%rax) # 26 *movv4si_internal/3 >> [length = 7] >> cmpq $128, %rax # 29 *cmpdi_1/1 [length = 6] >> jne .L2 # 30 *jcc_1 [length = 2] >> >> As shown above, it worked OK for both with -mssse3 (generating pabsd >> insn) and without (generating your V4SI sequence). >> >> Uros.
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 8a38316..84c7ab5 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2013-10-22 Cong Hou <co...@google.com> + + PR target/58762 + * config/i386/i386-protos.h (ix86_expand_sse2_abs): New function. + * config/i386/i386.c (ix86_expand_sse2_abs): New function. + * config/i386/sse.md: Add SSE2 support to abs (8/16/32-bit-int). + 2013-10-14 David Malcolm <dmalc...@redhat.com> * dumpfile.h (gcc::dump_manager): New class, to hold state diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 3ab2f3a..ca31224 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -238,6 +238,7 @@ extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool); extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool); extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx); extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); +extern void ix86_expand_sse2_abs (rtx, rtx); /* In i386-c.c */ extern void ix86_target_macros (void); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 02cbbbd..71905fc 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -41696,6 +41696,53 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) gen_rtx_MULT (mode, op1, op2)); } +void +ix86_expand_sse2_abs (rtx op0, rtx op1) +{ + enum machine_mode mode = GET_MODE (op0); + rtx tmp0, tmp1; + + switch (mode) + { + /* For 32-bit signed integer X, the best way to calculate the absolute + value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ + case V4SImode: + tmp0 = expand_simple_binop (mode, ASHIFTRT, op1, + GEN_INT (GET_MODE_BITSIZE + (GET_MODE_INNER (mode)) - 1), + NULL, 0, OPTAB_DIRECT); + if (tmp0) + tmp1 = expand_simple_binop (mode, XOR, op1, tmp0, + NULL, 0, OPTAB_DIRECT); + if (tmp0 && tmp1) + expand_simple_binop (mode, MINUS, tmp1, tmp0, + op0, 0, OPTAB_DIRECT); + break; + + /* For 16-bit signed integer X, the best way to calculate the absolute + value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ + case V8HImode: + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0); + if (tmp0) + expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0, + OPTAB_DIRECT); + break; + + /* For 8-bit signed integer X, the best way to calculate the absolute + value of X is min ((unsigned char) X, (unsigned char) (-X)), + as SSE2 provides the PMINUB insn. */ + case V16QImode: + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0); + if (tmp0) + expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0, + OPTAB_DIRECT); + break; + + default: + break; + } +} + /* Expand an insert into a vector register through pinsr insn. Return true if successful. */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index c3f6c94..46e1df4 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -8721,7 +8721,7 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) -(define_insn "abs<mode>2" +(define_insn "*abs<mode>2" [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v") (abs:VI124_AVX2_48_AVX512F (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))] @@ -8733,6 +8733,19 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "<sseinsnmode>")]) +(define_expand "abs<mode>2" + [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand") + (abs:VI124_AVX2_48_AVX512F + (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))] + "TARGET_SSE2" +{ + if (!TARGET_SSSE3) + { + ix86_expand_sse2_abs (operands[0], operands[1]); + DONE; + } +}) + (define_insn "abs<mode>2" [(set (match_operand:MMXMODEI 0 "register_operand" "=y") (abs:MMXMODEI diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 075d071..cf5b942 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2013-10-22 Cong Hou <co...@google.com> + + PR target/58762 + * gcc.dg/vect/pr58762.c: New test. + 2013-10-14 Tobias Burnus <bur...@net-b.de> PR fortran/58658 diff --git a/gcc/testsuite/gcc.dg/vect/pr58762.c b/gcc/testsuite/gcc.dg/vect/pr58762.c new file mode 100644 index 0000000..6468d0a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr58762.c @@ -0,0 +1,28 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +void test1 (char* a, char* b) +{ + int i; + for (i = 0; i < 10000; ++i) + a[i] = abs (b[i]); +} + +void test2 (short* a, short* b) +{ + int i; + for (i = 0; i < 10000; ++i) + a[i] = abs (b[i]); +} + +void test3 (int* a, int* b) +{ + int i; + for (i = 0; i < 10000; ++i) + a[i] = abs (b[i]); +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect" + { target i?86-*-* x86_64-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */