Hi Richard, thanks for the suggestions; updated.
Regards, Yuliang gcc/ChangeLog: 2019-10-17 Yuliang Wang <yuliang.w...@arm.com> * config/aarch64/aarch64-sve2.md (aarch64_sve2_eor3<mode>) (aarch64_sve2_nor<mode>, aarch64_sve2_nand<mode>) (aarch64_sve2_bsl<mode>, aarch64_sve2_nbsl<mode>) (aarch64_sve2_bsl1n<mode>, aarch64_sve2_bsl2n<mode>): New combine patterns. * config/aarch64/iterators.md (BSL_DUP): New int iterator for the above. (bsl_1st, bsl_2nd, bsl_dup, bsl_mov): Attributes for the above. * config/aarch64/aarch64.h (AARCH64_ISA_SVE2_SHA3): New ISA flag macro. (TARGET_SVE2_SHA3): New CPU target. gcc/testsuite/ChangeLog: 2019-10-17 Yuliang Wang <yuliang.w...@arm.com> * gcc.target/aarch64/sve2/eor3_1.c: New test. * gcc.target/aarch64/sve2/eor3_2.c: As above. * gcc.target/aarch64/sve2/nlogic_1.c: As above. * gcc.target/aarch64/sve2/nlogic_2.c: As above. * gcc.target/aarch64/sve2/bitsel_1.c: As above. * gcc.target/aarch64/sve2/bitsel_2.c: As above. * gcc.target/aarch64/sve2/bitsel_3.c: As above. * gcc.target/aarch64/sve2/bitsel_4.c: As above. diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index b018f5b0bc9b51edf831e2571f0f5a9af2210829..08d5214a3debb9e9a0796da0af3009ed3ff55774 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -142,3 +142,189 @@ } ) +;; Unpredicated 3-way exclusive OR. +(define_insn "*aarch64_sve2_eor3<mode>" + [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w, ?&w") + (xor:SVE_I + (xor:SVE_I + (match_operand:SVE_I 1 "register_operand" "0, w, w, w") + (match_operand:SVE_I 2 "register_operand" "w, 0, w, w")) + (match_operand:SVE_I 3 "register_operand" "w, w, 0, w")))] + "TARGET_SVE2_SHA3" + "@ + eor3\t%0.d, %0.d, %2.d, %3.d + eor3\t%0.d, %0.d, %1.d, %3.d + eor3\t%0.d, %0.d, %1.d, %2.d + movprfx\t%0, %1\;eor3\t%0.d, %0.d, %2.d, %3.d" + [(set_attr "movprfx" "*,*,*,yes")] +) + +;; Use NBSL for vector NOR. 
+(define_insn_and_rewrite "*aarch64_sve2_nor<mode>" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I + [(match_operand 3) + (and:SVE_I + (not:SVE_I + (match_operand:SVE_I 1 "register_operand" "%0, w")) + (not:SVE_I + (match_operand:SVE_I 2 "register_operand" "w, w")))] + UNSPEC_PRED_X))] + "TARGET_SVE2" + "@ + nbsl\t%0.d, %0.d, %2.d, %0.d + movprfx\t%0, %1\;nbsl\t%0.d, %0.d, %2.d, %0.d" + "&& !CONSTANT_P (operands[3])" + { + operands[3] = CONSTM1_RTX (<VPRED>mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Use NBSL for vector NAND. +(define_insn_and_rewrite "*aarch64_sve2_nand<mode>" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I + [(match_operand 3) + (ior:SVE_I + (not:SVE_I + (match_operand:SVE_I 1 "register_operand" "%0, w")) + (not:SVE_I + (match_operand:SVE_I 2 "register_operand" "w, w")))] + UNSPEC_PRED_X))] + "TARGET_SVE2" + "@ + nbsl\t%0.d, %0.d, %2.d, %2.d + movprfx\t%0, %1\;nbsl\t%0.d, %0.d, %2.d, %2.d" + "&& !CONSTANT_P (operands[3])" + { + operands[3] = CONSTM1_RTX (<VPRED>mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Unpredicated bitwise select. +;; N.B. non-canonical equivalent form due to expand pass. +;; (op3 ? bsl_mov : bsl_dup) == (((bsl_mov ^ bsl_dup) & op3) ^ bsl_dup) +(define_insn "*aarch64_sve2_bsl<mode>" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (xor:SVE_I + (and:SVE_I + (xor:SVE_I + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) + (match_operand:SVE_I 3 "register_operand" "w, w")) + (match_dup BSL_DUP)))] + "TARGET_SVE2" + "@ + bsl\t%0.d, %0.d, %<bsl_dup>.d, %3.d + movprfx\t%0, %<bsl_mov>\;bsl\t%0.d, %0.d, %<bsl_dup>.d, %3.d" + [(set_attr "movprfx" "*,yes")] +) + +;; Unpredicated bitwise inverted select. +;; N.B. non-canonical equivalent form. +;; (~(op3 ? 
bsl_mov : bsl_dup)) == (~(((bsl_mov ^ bsl_dup) & op3) ^ bsl_dup)) +(define_insn_and_rewrite "*aarch64_sve2_nbsl<mode>" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I + [(match_operand 4) + (not:SVE_I + (xor:SVE_I + (and:SVE_I + (xor:SVE_I + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) + (match_operand:SVE_I 3 "register_operand" "w, w")) + (match_dup BSL_DUP)))] + UNSPEC_PRED_X))] + "TARGET_SVE2" + "@ + nbsl\t%0.d, %0.d, %<bsl_dup>.d, %3.d + movprfx\t%0, %<bsl_mov>\;nbsl\t%0.d, %0.d, %<bsl_dup>.d, %3.d" + "&& !CONSTANT_P (operands[4])" + { + operands[4] = CONSTM1_RTX (<VPRED>mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Unpredicated bitwise select with inverted first operand. +;; N.B. non-canonical equivalent form. +;; (op3 ? ~bsl_mov : bsl_dup) == (((~bsl_mov ^ bsl_dup) & op3) ^ bsl_dup) +(define_insn_and_rewrite "*aarch64_sve2_bsl1n<mode>" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (xor:SVE_I + (and:SVE_I + (unspec:SVE_I + [(match_operand 4) + (not:SVE_I + (xor:SVE_I + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")))] + UNSPEC_PRED_X) + (match_operand:SVE_I 3 "register_operand" "w, w")) + (match_dup BSL_DUP)))] + "TARGET_SVE2" + "@ + bsl1n\t%0.d, %0.d, %<bsl_dup>.d, %3.d + movprfx\t%0, %<bsl_mov>\;bsl1n\t%0.d, %0.d, %<bsl_dup>.d, %3.d" + "&& !CONSTANT_P (operands[4])" + { + operands[4] = CONSTM1_RTX (<VPRED>mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Unpredicated bitwise select with inverted second operand. 
+(define_insn_and_rewrite "*aarch64_sve2_bsl2n<mode>" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (ior:SVE_I + (and:SVE_I + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) + (unspec:SVE_I + [(match_operand 4) + (and:SVE_I + (not:SVE_I + (match_operand:SVE_I 3 "register_operand" "w, w")) + (not:SVE_I + (match_dup BSL_DUP)))] + UNSPEC_PRED_X)))] + "TARGET_SVE2" + "@ + bsl2n\t%0.d, %0.d, %3.d, %<bsl_dup>.d + movprfx\t%0, %<bsl_mov>\;bsl2n\t%0.d, %0.d, %3.d, %<bsl_dup>.d" + "&& !CONSTANT_P (operands[4])" + { + operands[4] = CONSTM1_RTX (<VPRED>mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Unpredicated bitwise select with inverted second operand, alternative form. +(define_insn_and_rewrite "*aarch64_sve2_bsl2n<mode>" + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (ior:SVE_I + (and:SVE_I + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) + (unspec:SVE_I + [(match_operand 4) + (and:SVE_I + (not:SVE_I + (match_dup BSL_DUP)) + (not:SVE_I + (match_operand:SVE_I 3 "register_operand" "w, w")))] + UNSPEC_PRED_X)))] + "TARGET_SVE2" + "@ + bsl2n\t%0.d, %0.d, %3.d, %<bsl_dup>.d + movprfx\t%0, %<bsl_mov>\;bsl2n\t%0.d, %0.d, %3.d, %<bsl_dup>.d" + "&& !CONSTANT_P (operands[4])" + { + operands[4] = CONSTM1_RTX (<VPRED>mode); + } + [(set_attr "movprfx" "*,yes")] +) + diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index abd14a2f92c06828adfc6d2e2e81b63a6163d3a3..cad401ceb2419b6a0a64f2396c8e7d5b9105fb22 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -236,6 +236,7 @@ extern unsigned aarch64_architecture_version; #define AARCH64_ISA_F16 (aarch64_isa_flags & AARCH64_FL_F16) #define AARCH64_ISA_SVE (aarch64_isa_flags & AARCH64_FL_SVE) #define AARCH64_ISA_SVE2 (aarch64_isa_flags & AARCH64_FL_SVE2) +#define AARCH64_ISA_SVE2_SHA3 (aarch64_isa_flags & 
AARCH64_FL_SVE2_SHA3) #define AARCH64_ISA_V8_3 (aarch64_isa_flags & AARCH64_FL_V8_3) #define AARCH64_ISA_DOTPROD (aarch64_isa_flags & AARCH64_FL_DOTPROD) #define AARCH64_ISA_AES (aarch64_isa_flags & AARCH64_FL_AES) @@ -285,6 +286,9 @@ extern unsigned aarch64_architecture_version; /* SVE2 instructions, enabled through +sve2. */ #define TARGET_SVE2 (AARCH64_ISA_SVE2) +/* SVE2 SHA3 instructions, enabled through +sve2-sha3. */ +#define TARGET_SVE2_SHA3 (TARGET_SVE2 && AARCH64_ISA_SVE2_SHA3) + /* ARMv8.3-A features. */ #define TARGET_ARMV8_3 (AARCH64_ISA_V8_3) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 1e321af710bfe80606eedee7e0d191f36c70355b..f879fadb007a23749a523edbe7fe247dee33fa94 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -1611,6 +1611,8 @@ (define_int_iterator SHRNT [UNSPEC_SHRNT UNSPEC_RSHRNT]) +(define_int_iterator BSL_DUP [1 2]) + (define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT]) (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN @@ -1976,6 +1978,18 @@ (UNSPEC_RADDHN2 "add") (UNSPEC_RSUBHN2 "sub")]) +;; BSL variants: first commutative operand. +(define_int_attr bsl_1st [(1 "w") (2 "0")]) + +;; BSL variants: second commutative operand. +(define_int_attr bsl_2nd [(1 "0") (2 "w")]) + +;; BSL variants: duplicated input operand. +(define_int_attr bsl_dup [(1 "1") (2 "2")]) + +;; BSL variants: operand which requires preserving via movprfx. 
+(define_int_attr bsl_mov [(1 "2") (2 "1")]) + (define_int_attr offsetlr [(UNSPEC_SSLI "") (UNSPEC_USLI "") (UNSPEC_SSRI "offset_") (UNSPEC_USRI "offset_")]) diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c new file mode 100644 index 0000000000000000000000000000000000000000..5c58ff54231d88a4ebf0a91fe4fac97079c8d992 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ + +#include <stdint.h> + +#ifndef OP +#define OP(x,y,z) (((x) & (z)) | ((y) & ~(z))) +#endif + +#define TYPE(N) int##N##_t + +#define TEMPLATE(SIZE) \ +void __attribute__ ((noinline, noclone)) \ +f_##SIZE##_##OP \ + (TYPE(SIZE) *restrict a, TYPE(SIZE) *restrict b, \ + TYPE(SIZE) *restrict c, TYPE(SIZE) *restrict d, int n) \ +{ \ + for (int i = 0; i < n; i++) \ + a[i] = OP (b[i], c[i], d[i]); \ +} + +TEMPLATE (8); +TEMPLATE (16); +TEMPLATE (32); +TEMPLATE (64); + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +/* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */ +/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */ + +/* { dg-final { scan-assembler-times {\tbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c new file mode 100644 index 0000000000000000000000000000000000000000..ac0d27213e84bb5c7f3d236f3cac59c71ac674ed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_2.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ + +#define OP(x,y,z) (~(((x) & (z)) | ((y) & ~(z)))) + +#include "bitsel_1.c" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +/* { dg-final { scan-assembler-not 
{\teor\tz[0-9]+\.[bhsd]} } } */ +/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */ +/* { dg-final { scan-assembler-not {\tnot\tz[0-9]+\.[bhsd]} } } */ + +/* { dg-final { scan-assembler-times {\tnbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c new file mode 100644 index 0000000000000000000000000000000000000000..93995bb8bade89cd821ed85153d13e96bd4422a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_3.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ + +#define OP(x,y,z) ((~(x) & (z)) | ((y) & ~(z))) + +#include "bitsel_1.c" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +/* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */ +/* { dg-final { scan-assembler-not {\tbic\tz[0-9]+\.[bhsd]} } } */ + +/* { dg-final { scan-assembler-times {\tbsl1n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c new file mode 100644 index 0000000000000000000000000000000000000000..7ccec619b4d1e8de366c0b0c53879a89a00c2c49 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_4.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ + +#define OP(x,y,z) (((x) & (z)) | (~(y) & ~(z))) + +#include "bitsel_1.c" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +/* { dg-final { scan-assembler-not {\torr\tz[0-9]+\.[bhsd]} } } */ +/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */ +/* { dg-final { scan-assembler-not {\tnot\tz[0-9]+\.[bhsd]} } } */ + +/* { dg-final { scan-assembler-times {\tbsl2n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 
} } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c new file mode 100644 index 0000000000000000000000000000000000000000..8eb7152f2cf43328b9ddf43688be1bea63447308 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/eor3_1.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -march=armv8.5-a+sve2-sha3 -fdump-tree-vect-details --save-temps" } */ + +#define OP(x,y,z) ((x) ^ (y) ^ (z)) + +#include "bitsel_1.c" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +/* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.[bhsd]} } } */ + +/* { dg-final { scan-assembler-times {\teor3\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/eor3_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/eor3_2.c new file mode 100644 index 0000000000000000000000000000000000000000..0bb87a668c6ab2917ce6746f7a8bf242781d0c35 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/eor3_2.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ + +#define OP(x,y,z) ((x) ^ (y) ^ (z)) + +#include "bitsel_1.c" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.[bhsd], z[0-9]+\.[bhsd], z[0-9]+\.[bhsd]} 8 } } */ + +/* { dg-final { scan-assembler-not {\teor3\tz[0-9]+\.d\n} } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/nlogic_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/nlogic_1.c new file mode 100644 index 0000000000000000000000000000000000000000..ef0e266bd93bb3d3b5af204438ad8ef35faa5675 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/nlogic_1.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ + +#include <stdint.h> + +#ifndef OP +#define OP(x,y) (~((x) 
| (y))) +#endif + +#define TYPE(N) int##N##_t + +#define TEMPLATE(SIZE) \ +void __attribute__ ((noinline, noclone)) \ +f_##SIZE##_##OP \ + (TYPE(SIZE) *restrict a, TYPE(SIZE) *restrict b, \ + TYPE(SIZE) *restrict c, int n) \ +{ \ + for (int i = 0; i < n; i++) \ + a[i] = OP (b[i], c[i]); \ +} + +TEMPLATE (8); +TEMPLATE (16); +TEMPLATE (32); +TEMPLATE (64); + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */ +/* { dg-final { scan-assembler-not {\torr\tz[0-9]+\.[bhsd]} } } */ +/* { dg-final { scan-assembler-not {\tnot\tz[0-9]+\.[bhsd]} } } */ + +/* { dg-final { scan-assembler-times {\tnbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/nlogic_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/nlogic_2.c new file mode 100644 index 0000000000000000000000000000000000000000..da8c86161625ff51814c0d8d4e5d51035ad1b1f6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve2/nlogic_2.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ + +#define OP(x,y) (~((x) & (y))) + +#include "nlogic_1.c" + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.[bhsd]} } } */ +/* { dg-final { scan-assembler-not {\torr\tz[0-9]+\.[bhsd]} } } */ +/* { dg-final { scan-assembler-not {\tnot\tz[0-9]+\.[bhsd]} } } */ + +/* { dg-final { scan-assembler-times {\tnbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ + -----Original Message----- From: Richard Sandiford <richard.sandif...@arm.com> Sent: 16 October 2019 20:48 To: Yuliang Wang <yuliang.w...@arm.com> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com> Subject: Re: [AArch64][SVE2] Support for EOR3 and variants of BSL Thanks for the patch, looks really good. 
Yuliang Wang <yuliang.w...@arm.com> writes: > +;; Use NBSL for vector NOR. > +(define_insn_and_rewrite "*aarch64_sve2_nor<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") > + (unspec:SVE_I > + [(match_operand 3) > + (and:SVE_I > + (not:SVE_I > + (match_operand:SVE_I 1 "register_operand" "w, 0, w")) > + (not:SVE_I > + (match_operand:SVE_I 2 "register_operand" "0, w, w")))] > + UNSPEC_PRED_X))] > + "TARGET_SVE2" > + "@ > + nbsl\t%0.d, %0.d, %1.d, %0.d > + nbsl\t%0.d, %0.d, %2.d, %0.d > + movprfx\t%0, %2\;nbsl\t%0.d, %0.d, %1.d, %0.d" > + "&& !CONSTANT_P (operands[3])" > + { > + operands[3] = CONSTM1_RTX (<VPRED>mode); > + } > + [(set_attr "movprfx" "*,*,yes")] > +) > + > +;; Use NBSL for vector NAND. > +(define_insn_and_rewrite "*aarch64_sve2_nand<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") > + (unspec:SVE_I > + [(match_operand 3) > + (ior:SVE_I > + (not:SVE_I > + (match_operand:SVE_I 1 "register_operand" "w, 0, w")) > + (not:SVE_I > + (match_operand:SVE_I 2 "register_operand" "0, w, w")))] > + UNSPEC_PRED_X))] > + "TARGET_SVE2" > + "@ > + nbsl\t%0.d, %0.d, %1.d, %1.d > + nbsl\t%0.d, %0.d, %2.d, %2.d > + movprfx\t%0, %2\;nbsl\t%0.d, %0.d, %1.d, %1.d" > + "&& !CONSTANT_P (operands[3])" > + { > + operands[3] = CONSTM1_RTX (<VPRED>mode); > + } > + [(set_attr "movprfx" "*,*,yes")] > +) For these two it should in theory be slightly better to use "%" on operand 1 to make the operation commutative, rather than provide a separate matching alternative for operand 2. 
E.g.: (define_insn_and_rewrite "*aarch64_sve2_nand<mode>" [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") (unspec:SVE_I [(match_operand 3) (ior:SVE_I (not:SVE_I (match_operand:SVE_I 1 "register_operand" "%0, w")) (not:SVE_I (match_operand:SVE_I 2 "register_operand" "w, w")))] UNSPEC_PRED_X))] "TARGET_SVE2" "@ nbsl\t%0.d, %0.d, %2.d, %2.d movprfx\t%0, %1\;nbsl\t%0.d, %0.d, %2.d, %2.d" "&& !CONSTANT_P (operands[3])" { operands[3] = CONSTM1_RTX (<VPRED>mode); } [(set_attr "movprfx" "*,*,yes")] ) (But the EOR3 pattern is fine as-is, since that supports any permutation of the three inputs.) > +;; Unpredicated bitwise select. > +(define_insn "*aarch64_sve2_bsl<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (xor:SVE_I > + (and:SVE_I > + (xor:SVE_I > + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") > + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) > + (match_operand:SVE_I 3 "register_operand" "w, w")) > + (match_dup BSL_3RD)))] > + "TARGET_SVE2" > + "@ > + bsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d > + movprfx\t%0, %<bsl_mov>\;bsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d" > + [(set_attr "movprfx" "*,yes")] > +) This is sufficiently far from the documented logic that it might be worth a comment. E.g.: ;; (op3 ? bsl_mov : bsl_3rd) == (((bsl_mov ^ bsl_3rd) & op3) ^ bsl_3rd) Similarly for the later patterns. > +;; Unpredicated bitwise inverted select. 
> +(define_insn_and_rewrite "*aarch64_sve2_nbsl<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (unspec:SVE_I > + [(match_operand 4) > + (not:SVE_I > + (xor:SVE_I > + (and:SVE_I > + (xor:SVE_I > + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") > + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) > + (match_operand:SVE_I 3 "register_operand" "w, w")) > + (match_dup BSL_3RD)))] > + UNSPEC_PRED_X))] > + "TARGET_SVE2" > + "@ > + nbsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d > + movprfx\t%0, %<bsl_mov>\;nbsl\t%0.d, %0.d, %<bsl_3rd>.d, %3.d" > + "&& !CONSTANT_P (operands[4])" > + { > + operands[4] = CONSTM1_RTX (<VPRED>mode); > + } > + [(set_attr "movprfx" "*,yes")] > +) > + > +;; Unpredicated bitwise select with inverted first operand. > +(define_insn_and_rewrite "*aarch64_sve2_bsl1n<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (xor:SVE_I > + (and:SVE_I > + (unspec:SVE_I > + [(match_operand 4) > + (not:SVE_I > + (xor:SVE_I > + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") > + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")))] > + UNSPEC_PRED_X) > + (match_operand:SVE_I 3 "register_operand" "w, w")) > + (match_dup BSL_3RD)))] > + "TARGET_SVE2" > + "@ > + bsl1n\t%0.d, %0.d, %<bsl_3rd>.d, %3.d > + movprfx\t%0, %<bsl_mov>\;bsl1n\t%0.d, %0.d, %<bsl_3rd>.d, %3.d" > + "&& !CONSTANT_P (operands[4])" > + { > + operands[4] = CONSTM1_RTX (<VPRED>mode); > + } > + [(set_attr "movprfx" "*,yes")] > +) > + > +;; Unpredicated bitwise select with inverted second operand. 
> +(define_insn_and_rewrite "*aarch64_sve2_bsl2n<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (ior:SVE_I > + (and:SVE_I > + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") > + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) > + (unspec:SVE_I > + [(match_operand 4) > + (and:SVE_I > + (not:SVE_I > + (match_operand:SVE_I 3 "register_operand" "w, w")) > + (not:SVE_I > + (match_dup BSL_3RD)))] > + UNSPEC_PRED_X)))] > + "TARGET_SVE2" > + "@ > + bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d > + movprfx\t%0, %<bsl_mov>\;bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d" > + "&& !CONSTANT_P (operands[4])" > + { > + operands[4] = CONSTM1_RTX (<VPRED>mode); > + } > + [(set_attr "movprfx" "*,yes")] > +) > + > +;; Unpredicated bitwise select with inverted second operand, alternative > form. > +(define_insn_and_rewrite "*aarch64_sve2_bsl2n<mode>" > + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") > + (ior:SVE_I > + (and:SVE_I > + (match_operand:SVE_I 1 "register_operand" "<bsl_1st>, w") > + (match_operand:SVE_I 2 "register_operand" "<bsl_2nd>, w")) > + (unspec:SVE_I > + [(match_operand 4) > + (and:SVE_I > + (not:SVE_I > + (match_dup BSL_3RD)) > + (not:SVE_I > + (match_operand:SVE_I 3 "register_operand" "w, w")))] > + UNSPEC_PRED_X)))] > + "TARGET_SVE2" > + "@ > + bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d > + movprfx\t%0, %<bsl_mov>\;bsl2n\t%0.d, %0.d, %3.d, %<bsl_3rd>.d" > + "&& !CONSTANT_P (operands[4])" > + { > + operands[4] = CONSTM1_RTX (<VPRED>mode); > + } > + [(set_attr "movprfx" "*,yes")] > +) bsl_3rd might be a confusing name given the above, where it ends up being the 4th operand. How about renaming it to bsl_dup instead? (Same for the iterator.) Maybe bsl_tied instead of bsl_mov too. 
> diff --git a/gcc/config/aarch64/aarch64.h > b/gcc/config/aarch64/aarch64.h index > abd14a2f92c06828adfc6d2e2e81b63a6163d3a3..a18565f5c2046c207a281522917e > b6e2128aefbf 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -236,6 +236,10 @@ extern unsigned aarch64_architecture_version; > #define AARCH64_ISA_F16 (aarch64_isa_flags & AARCH64_FL_F16) > #define AARCH64_ISA_SVE (aarch64_isa_flags & AARCH64_FL_SVE) > #define AARCH64_ISA_SVE2 (aarch64_isa_flags & AARCH64_FL_SVE2) > +#define AARCH64_ISA_SVE2_AES (aarch64_isa_flags & AARCH64_FL_SVE2_AES) > +#define AARCH64_ISA_SVE2_SM4 (aarch64_isa_flags & AARCH64_FL_SVE2_SM4) > +#define AARCH64_ISA_SVE2_SHA3 (aarch64_isa_flags & > AARCH64_FL_SVE2_SHA3) > +#define AARCH64_ISA_SVE2_BITPERM (aarch64_isa_flags & > AARCH64_FL_SVE2_BITPERM) > #define AARCH64_ISA_V8_3 (aarch64_isa_flags & AARCH64_FL_V8_3) > #define AARCH64_ISA_DOTPROD (aarch64_isa_flags & AARCH64_FL_DOTPROD) > #define AARCH64_ISA_AES (aarch64_isa_flags & AARCH64_FL_AES) > @@ -285,6 +289,18 @@ extern unsigned aarch64_architecture_version; > /* SVE2 instructions, enabled through +sve2. */ #define TARGET_SVE2 > (AARCH64_ISA_SVE2) > > +/* SVE2 AES instructions, enabled through +sve2-aes. */ #define > +TARGET_SVE2_AES (TARGET_SVE2 && AARCH64_ISA_SVE2_AES) > + > +/* SVE2 SM4 instructions, enabled through +sve2-sm4. */ #define > +TARGET_SVE2_SM4 (TARGET_SVE2 && AARCH64_ISA_SVE2_SM4) > + > +/* SVE2 SHA3 instructions, enabled through +sve2-sha3. */ #define > +TARGET_SVE2_SHA3 (TARGET_SVE2 && AARCH64_ISA_SVE2_SHA3) > + > +/* SVE2 bitperm instructions, enabled through +sve2-bitperm. */ > +#define TARGET_SVE2_BITPERM (TARGET_SVE2 && AARCH64_ISA_SVE2_BITPERM) > + > /* ARMv8.3-A features. */ > #define TARGET_ARMV8_3 (AARCH64_ISA_V8_3) These two hunks aren't needed for the patch. I think it would be better to leave them undefined until we need them. 
> diff --git a/gcc/config/aarch64/iterators.md > b/gcc/config/aarch64/iterators.md index > 1e321af710bfe80606eedee7e0d191f36c70355b..f6c74cce981aee6fb42704570ad6 > a00537f60d6e 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -1611,6 +1611,8 @@ > > (define_int_iterator SHRNT [UNSPEC_SHRNT UNSPEC_RSHRNT]) > > +(define_int_iterator BSL_3RD [1 2]) > + > (define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT]) > > (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN @@ -1976,6 > +1978,18 @@ > (UNSPEC_RADDHN2 "add") > (UNSPEC_RSUBHN2 "sub")]) > > +;; BSL variants: first commutative operand. > +(define_int_attr bsl_1st [(1 "w") (2 "0")]) > + > +;; BSL variants: second commutative operand. > +(define_int_attr bsl_2nd [(1 "0") (2 "w")]) > + > +;; BSL variants: third input operand. > +(define_int_attr bsl_3rd [(1 "1") (2 "2")]) Just to show I thought about it :-) this isn't really necessary, since <BSL_3RD> should work too. But I agree it looks nicer to have everything in the same case, so I think it's better to have the attribute regardless. > + > +;; BSL variants: operand which requires preserving via movprfx. 
> +(define_int_attr bsl_mov [(1 "2") (2 "1")]) > + > (define_int_attr offsetlr [(UNSPEC_SSLI "") (UNSPEC_USLI "") > (UNSPEC_SSRI "offset_") > (UNSPEC_USRI "offset_")]) > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c > b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..041552f5081238197c455fb667ab > d19e3c74c4a8 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/bitsel_1.c > @@ -0,0 +1,33 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details > +--save-temps" } */ > + > +#include <stdint.h> > + > +#ifndef OP > +#define OP(x,y,z) (((x) & (z)) | ((y) & ~(z))) #endif > + > +#define TYPE(N) int##N##_t > + > +#define TEMPLATE(SIZE) \ > +void __attribute__ ((noinline, noclone)) \ > +f_##SIZE##_##OP \ > + (TYPE(SIZE) *restrict a, TYPE(SIZE) *restrict b, \ > + TYPE(SIZE) *restrict c, TYPE(SIZE) *restrict d, int n) \ > +{ \ > + for (int i = 0; i < n; i++) \ > + a[i] = OP (b[i], c[i], d[i]); \ > +} > + > +TEMPLATE (8); > +TEMPLATE (16); > +TEMPLATE (32); > +TEMPLATE (64); > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" > +4 "vect" } } */ > + > +/* { dg-final { scan-assembler-not {\teor\tz[0-9]+\.d} } } */ > +/* { dg-final { scan-assembler-not {\tand\tz[0-9]+\.d} } } */ For these I think: /* { dg-final { scan-assembler-not {\teor\tz} } } */ /* { dg-final { scan-assembler-not {\tand\tz} } } */ would be better, since it would be possible to use EOR or AND with zN.b rather than zN.d. Same for the other files. Thanks, Richard