Hi, Add support for the vmsumudm instruction and tie it into the vec_msum built-in to support the variants of that built-in using vector __int128 parameters.
vector __uint128_t vec_msum (vector unsigned long long, vector unsigned long long, vector __uint128_t); vector __int128_t vec_msum (vector signed long long, vector signed long long, vector __int128_t); [v2] Corrected the define_insn and test requirements to be limited to P9. Improved the description to clarify that vmsum does a widening multiply and horizontal addition. Fresh regtests currently running on assorted powerpc targets. OK for trunk? Thanks, -Will [gcc] 2020-06-18 Will Schmidt <will_schm...@vnet.ibm.com> * config/rs6000/altivec.h (vec_vmsumudm): New define. * config/rs6000/altivec.md (UNSPEC_VMSUMUDM): New unspec. (altivec_vmsumudm): New define_insn. * config/rs6000/rs6000-builtin.def (altivec_vmsumudm): New BU_ALTIVEC_3 entry. (vmsumudm): New BU_ALTIVEC_OVERLOAD_3 entry. * config/rs6000/rs6000-call.c (altivec_overloaded_builtins): Add entries for ALTIVEC_BUILTIN_VMSUMUDM variants of vec_msum. * doc/extend.texi: Document the new vec_msum variants. [testsuite] 2020-06-18 Will Schmidt <will_schm...@vnet.ibm.com> * gcc.target/powerpc/builtins-msum-runnable.c: New test. * gcc.target/powerpc/vsx-builtin-msum.c: New test. diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h index bb1524f4a679..0d199393556d 100644 --- a/gcc/config/rs6000/altivec.h +++ b/gcc/config/rs6000/altivec.h @@ -159,10 +159,11 @@ #define vec_vmsumuhm __builtin_vec_vmsumuhm #define vec_vmsummbm __builtin_vec_vmsummbm #define vec_vmsumubm __builtin_vec_vmsumubm #define vec_vmsumshs __builtin_vec_vmsumshs #define vec_vmsumuhs __builtin_vec_vmsumuhs +#define vec_vmsumudm __builtin_vec_vmsumudm #define vec_vmulesb __builtin_vec_vmulesb #define vec_vmulesh __builtin_vec_vmulesh #define vec_vmuleuh __builtin_vec_vmuleuh #define vec_vmuleub __builtin_vec_vmuleub #define vec_vmulosh __builtin_vec_vmulosh diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 2ce9227c765a..84fee916d041 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -19,10 +19,11 @@ ;; <http://www.gnu.org/licenses/>. 
(define_c_enum "unspec" [UNSPEC_VCMPBFP UNSPEC_VMSUMU + UNSPEC_VMSUMUDM UNSPEC_VMSUMM UNSPEC_VMSUMSHM UNSPEC_VMSUMUHS UNSPEC_VMSUMSHS UNSPEC_VMHADDSHS @@ -970,10 +971,20 @@ UNSPEC_VMSUMU))] "TARGET_ALTIVEC" "vmsumu<VI_char>m %0,%1,%2,%3" [(set_attr "type" "veccomplex")]) +(define_insn "altivec_vmsumudm" + [(set (match_operand:V1TI 0 "register_operand" "=v") + (unspec:V1TI [(match_operand:V2DI 1 "register_operand" "v") + (match_operand:V2DI 2 "register_operand" "v") + (match_operand:V1TI 3 "register_operand" "v")] + UNSPEC_VMSUMUDM))] + "TARGET_P9_VECTOR" + "vmsumudm %0,%1,%2,%3" + [(set_attr "type" "veccomplex")]) + (define_insn "altivec_vmsumm<VI_char>m" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:VIshort 1 "register_operand" "v") (match_operand:VIshort 2 "register_operand" "v") (match_operand:V4SI 3 "register_operand" "v")] diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index 363656ec05cc..ee0d787cfa22 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -1140,10 +1140,11 @@ BU_ALTIVEC_3 (VMHADDSHS, "vmhaddshs", SAT, altivec_vmhaddshs) BU_ALTIVEC_3 (VMHRADDSHS, "vmhraddshs", SAT, altivec_vmhraddshs) BU_ALTIVEC_3 (VMLADDUHM, "vmladduhm", CONST, fmav8hi4) BU_ALTIVEC_3 (VMSUMUBM, "vmsumubm", CONST, altivec_vmsumubm) BU_ALTIVEC_3 (VMSUMMBM, "vmsummbm", CONST, altivec_vmsummbm) BU_ALTIVEC_3 (VMSUMUHM, "vmsumuhm", CONST, altivec_vmsumuhm) +BU_ALTIVEC_3 (VMSUMUDM, "vmsumudm", CONST, altivec_vmsumudm) BU_ALTIVEC_3 (VMSUMSHM, "vmsumshm", CONST, altivec_vmsumshm) BU_ALTIVEC_3 (VMSUMUHS, "vmsumuhs", SAT, altivec_vmsumuhs) BU_ALTIVEC_3 (VMSUMSHS, "vmsumshs", SAT, altivec_vmsumshs) BU_ALTIVEC_3 (VNMSUBFP, "vnmsubfp", FP, nfmsv4sf4) BU_ALTIVEC_3 (VPERM_1TI, "vperm_1ti", CONST, altivec_vperm_v1ti) @@ -1497,10 +1498,11 @@ BU_ALTIVEC_OVERLOAD_3 (SEL, "sel") BU_ALTIVEC_OVERLOAD_3 (VMSUMMBM, "vmsummbm") BU_ALTIVEC_OVERLOAD_3 (VMSUMSHM, "vmsumshm") 
BU_ALTIVEC_OVERLOAD_3 (VMSUMSHS, "vmsumshs") BU_ALTIVEC_OVERLOAD_3 (VMSUMUBM, "vmsumubm") BU_ALTIVEC_OVERLOAD_3 (VMSUMUHM, "vmsumuhm") +BU_ALTIVEC_OVERLOAD_3 (VMSUMUDM, "vmsumudm") BU_ALTIVEC_OVERLOAD_3 (VMSUMUHS, "vmsumuhs") /* Altivec DST overloaded builtins. */ BU_ALTIVEC_OVERLOAD_D (DST, "dst") BU_ALTIVEC_OVERLOAD_D (DSTT, "dstt") diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index d3cf2de88780..8e7bb54c73d1 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -3087,10 +3087,16 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_V4SI, RS6000_BTI_V16QI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_V4SI }, { ALTIVEC_BUILTIN_VEC_MSUM, ALTIVEC_BUILTIN_VMSUMUHM, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V4SI }, { ALTIVEC_BUILTIN_VEC_MSUM, ALTIVEC_BUILTIN_VMSUMSHM, RS6000_BTI_V4SI, RS6000_BTI_V8HI, RS6000_BTI_V8HI, RS6000_BTI_V4SI }, + + { ALTIVEC_BUILTIN_VEC_MSUM, ALTIVEC_BUILTIN_VMSUMUDM, + RS6000_BTI_V1TI, RS6000_BTI_V2DI, RS6000_BTI_V2DI, RS6000_BTI_V1TI }, + { ALTIVEC_BUILTIN_VEC_MSUM, ALTIVEC_BUILTIN_VMSUMUDM, + RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V1TI }, + { ALTIVEC_BUILTIN_VEC_VMSUMSHM, ALTIVEC_BUILTIN_VMSUMSHM, RS6000_BTI_V4SI, RS6000_BTI_V8HI, RS6000_BTI_V8HI, RS6000_BTI_V4SI }, { ALTIVEC_BUILTIN_VEC_VMSUMUHM, ALTIVEC_BUILTIN_VMSUMUHM, RS6000_BTI_unsigned_V4SI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V8HI, RS6000_BTI_unsigned_V4SI }, { ALTIVEC_BUILTIN_VEC_VMSUMMBM, ALTIVEC_BUILTIN_VMSUMMBM, diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 5a57c1c49c56..8f75f60e7bc0 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -20211,10 +20211,17 @@ bool scalar_test_data_class (double source, const int condition); bool scalar_test_data_class (__ieee128 source, const int condition); bool scalar_test_neg (float source); bool 
scalar_test_neg (double source); bool scalar_test_neg (__ieee128 source); + +vector __uint128_t vec_msum (vector unsigned long long, + vector unsigned long long, + vector __uint128_t); +vector __int128_t vec_msum (vector signed long long, + vector signed long long, + vector __int128_t); @end smallexample The @code{scalar_extract_exp} and @code{scalar_extract_sig} functions require a 64-bit environment supporting ISA 3.0 or later. The @code{scalar_extract_exp} and @code{scalar_extract_sig} built-in @@ -20230,10 +20237,15 @@ When supplied with a 128-bit @code{source} argument, the treated similarly. Note that the sign of the significand is not represented in the result returned from the @code{scalar_extract_sig} function. Use the @code{scalar_test_neg} function to test the sign of its @code{double} argument. +The @code{vec_msum} functions perform a vector multiply-sum: the elements +of arg1 and arg2 are multiplied with a widening multiply (arg1*arg2), and +the products are then horizontally added to arg3. ISA 3.0 adds support for +the @code{vec_msum} builtin taking long long args for the multiply and an +int128 arg for the addition, and returning a vector int128 result. The @code{scalar_insert_exp} functions require a 64-bit environment supporting ISA 3.0 or later. 
When supplied with a 64-bit first argument, the @code{scalar_insert_exp} built-in function returns a double-precision diff --git a/gcc/testsuite/gcc.target/powerpc/builtins-msum-runnable.c b/gcc/testsuite/gcc.target/powerpc/builtins-msum-runnable.c new file mode 100644 index 000000000000..0fa5c319b6de --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/builtins-msum-runnable.c @@ -0,0 +1,74 @@ +/* { dg-do run { target { p9vector_hw } } } */ +/* { dg-options "-mdejagnu-cpu=power9 -O2" } */ + +#include <altivec.h> + +#ifdef DEBUG +#include <stdio.h> +#endif + +void abort (void); + +int +main() +{ + vector __uint128_t arg_uint128, result_uint128, expected_uint128; + vector __int128_t arg_int128, result_int128, expected_int128; + + arg_uint128[0] = 0x1627384950617243; + arg_uint128[0] = arg_uint128[0] << 64; + arg_uint128[0] |= 0x9405182930415263; + expected_uint128[0] = 0x1627384950617243; + expected_uint128[0] = expected_uint128[0] << 64; + expected_uint128[0] |= 0xb6b07e42a570e5fe; + vector unsigned long long arg_vull2 = {0x12345678,0x44445555}; + vector unsigned long long arg_vull3 = {0x6789abcd,0x66667777}; + result_uint128 = vec_msum (arg_vull2, arg_vull3, arg_uint128); + + if (result_uint128[0] != expected_uint128[0]) + { +#ifdef DEBUG + printf("result_uint128[0] doesn't match expected_u128[0]\n"); + printf("arg_vull2 %llx %llx \n", arg_vull2[0], arg_vull2[1]); + printf("arg_vull3 %llx %llx \n", arg_vull3[0], arg_vull3[1]); + printf("arg_uint128[0] = %llx ", arg_uint128[0] >> 64); + printf(" %llx\n", arg_uint128[0] & 0xFFFFFFFFFFFFFFFF); + + printf("result_uint128[0] = %llx ", result_uint128[0] >> 64); + printf(" %llx\n", result_uint128[0] & 0xFFFFFFFFFFFFFFFF); + + printf("expected_uint128[0] = %llx ", expected_uint128[0] >> 64); + printf(" %llx\n", expected_uint128[0] & 0xFFFFFFFFFFFFFFFF); +#else + abort(); +#endif + } + + arg_int128[0] = 0x1627384950617283; + arg_int128[0] = arg_int128[0] << 64; + arg_int128[0] |= 0x9405182930415263; + expected_int128[0] = 
0x1627384950617283; + expected_int128[0] = expected_int128[0] << 64; + expected_int128[0] |= 0xd99f35969c11cbfa; + vector signed long long arg_vll2 = { 0x567890ab, 0x1233456 }; + vector signed long long arg_vll3 = { 0xcdef0123, 0x9873451 }; + result_int128 = vec_msum (arg_vll2, arg_vll3, arg_int128); + + if (result_int128[0] != expected_int128[0]) + { +#ifdef DEBUG + printf("result_int128[0] doesn't match expected128[0]\n"); + printf("arg_int128[0] = %llx ", arg_int128[0] >> 64); + printf(" %llx\n", arg_int128[0] & 0xFFFFFFFFFFFFFFFF); + + printf("result_int128[0] = %llx ", result_int128[0] >> 64); + printf(" %llx\n", result_int128[0] & 0xFFFFFFFFFFFFFFFF); + + printf("expected_int128[0] = %llx ", expected_int128[0] >> 64); + printf(" %llx\n", expected_int128[0] & 0xFFFFFFFFFFFFFFFF); +#else + abort(); +#endif + } +} + diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-builtin-msum.c b/gcc/testsuite/gcc.target/powerpc/vsx-builtin-msum.c new file mode 100644 index 000000000000..1974864de00c --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/vsx-builtin-msum.c @@ -0,0 +1,25 @@ +/* Verify that overloaded built-ins for vec_msum with __int128 + inputs generate the proper code. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-options "-mdejagnu-cpu=power9 -O3" } */ + +#include <altivec.h> + +vector signed __int128 +test_msum_si (vector signed long long vsll_1, vector signed long long vsll_2, + vector signed __int128 vsi128) +{ + return vec_msum (vsll_1, vsll_2, vsi128); +} + +vector unsigned __int128 +test_msum_ui (vector unsigned long long vull_1, vector unsigned long long vull_2, + vector unsigned __int128 vui128) +{ + return vec_msum (vull_1, vull_2, vui128); +} + +/* { dg-final { scan-assembler-times "vmsumudm" 2 } } */ +