https://gcc.gnu.org/g:c2013267642fea4a6e89b826940c8aa80a76089d
commit r16-4945-gc2013267642fea4a6e89b826940c8aa80a76089d
Author: chenxiaolong <[email protected]>
Date:   Wed Oct 29 18:49:34 2025 +0800

    LoongArch: Add builtin interfaces for 128 and 256 vector conversions.

    gcc/ChangeLog:

            * config/loongarch/lasx.md (vec_cast<mode>): New template
            implementation.
            (vec_insert_lo_<mode>): Ditto.
            (vec_insert_hi_<mode>): Ditto.
            * config/loongarch/lasxintrin.h: Include lsxintrin.h and guard
            the new intrinsics with __loongarch_asx_sx_conv.
            (__lasx_cast_128_s): New function.
            (__lasx_cast_128_d): Ditto.
            (__lasx_cast_128): Ditto.
            (__lasx_concat_128_s): Ditto.
            (__lasx_concat_128_d): Ditto.
            (__lasx_concat_128): Ditto.
            (__lasx_extract_128_lo_s): Ditto.
            (__lasx_extract_128_hi_s): Ditto.
            (__lasx_extract_128_lo_d): Ditto.
            (__lasx_extract_128_hi_d): Ditto.
            (__lasx_extract_128_lo): Ditto.
            (__lasx_extract_128_hi): Ditto.
            (__lasx_insert_128_lo_s): Ditto.
            (__lasx_insert_128_hi_s): Ditto.
            (__lasx_insert_128_lo_d): Ditto.
            (__lasx_insert_128_hi_d): Ditto.
            (__lasx_insert_128_lo): Ditto.
            (__lasx_insert_128_hi): Ditto.
            * config/loongarch/loongarch-builtins.cc
            (CODE_FOR_lasx_extract_128_lo_s): Add builtin and register its
            icode.
            (CODE_FOR_lasx_extract_128_hi_s): Ditto.
            (CODE_FOR_lasx_extract_128_lo_d): Ditto.
            (CODE_FOR_lasx_extract_128_hi_d): Ditto.
            (CODE_FOR_lasx_extract_128_lo): Ditto.
            (CODE_FOR_lasx_extract_128_hi): Ditto.
            (CODE_FOR_lasx_insert_128_lo_s): Ditto.
            (CODE_FOR_lasx_insert_128_hi_s): Ditto.
            (CODE_FOR_lasx_insert_128_lo_d): Ditto.
            (CODE_FOR_lasx_insert_128_hi_d): Ditto.
            (CODE_FOR_lasx_insert_128_lo): Ditto.
            (CODE_FOR_lasx_insert_128_hi): Ditto.
            (CODE_FOR_lasx_concat_128_s): Ditto.
            (CODE_FOR_lasx_concat_128_d): Ditto.
            (CODE_FOR_lasx_concat_128): Ditto.
            (CODE_FOR_lasx_cast_128_s): Ditto.
            (CODE_FOR_lasx_cast_128_d): Ditto.
            (CODE_FOR_lasx_cast_128): Ditto.
            (loongarch_expand_builtin_direct): For the newly added insert
            and extract builtins, construct the PARALLEL operand selecting
            the requested half.
            * config/loongarch/loongarch-c.cc (loongarch_update_cpp_builtins):
            Define __loongarch_asx_sx_conv.
            * config/loongarch/loongarch-ftypes.def: Declare the types of
            the new builtin functions.
            * doc/extend.texi: Document the new intrinsics.

    gcc/testsuite/ChangeLog:

            * gcc.target/loongarch/vector/lasx/vect-concat-128-256-result.c: New test.
            * gcc.target/loongarch/vector/lasx/vect-concat-128-256.c: New test.
            * gcc.target/loongarch/vector/lasx/vect-extract-256-128-result.c: New test.
            * gcc.target/loongarch/vector/lasx/vect-extract-256-128.c: New test.
            * gcc.target/loongarch/vector/lasx/vect-insert-128-256-result.c: New test.
            * gcc.target/loongarch/vector/lasx/vect-insert-128-256.c: New test.

Diff:
---
 gcc/config/loongarch/lasx.md                       |  36 +++
 gcc/config/loongarch/lasxintrin.h                  | 156 +++++++++++++
 gcc/config/loongarch/loongarch-builtins.cc         |  90 ++++++-
 gcc/config/loongarch/loongarch-c.cc                |   1 +
 gcc/config/loongarch/loongarch-ftypes.def          |  12 +
 gcc/doc/extend.texi                                | 258 ++++++++++++++++++++-
 .../vector/lasx/vect-concat-128-256-result.c       |  68 ++++++
 .../loongarch/vector/lasx/vect-concat-128-256.c    |  92 ++++++++
 .../vector/lasx/vect-extract-256-128-result.c      |  69 ++++++
 .../loongarch/vector/lasx/vect-extract-256-128.c   |  86 +++++++
 .../vector/lasx/vect-insert-128-256-result.c       |  97 ++++++++
 .../loongarch/vector/lasx/vect-insert-128-256.c    |  95 ++++++++
 12 files changed, 1058 insertions(+), 2 deletions(-)

diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index 7a91473e4020..7704f8c798e5 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -130,6 +130,7 @@
 ;; Only used for splitting insert_d and copy_{u,s}.d.
 (define_mode_iterator LASX_WD [V4DI V4DF V8SI V8SF])
+(define_mode_iterator LASX_PART [V4DI V4DF V8SF])
 
 ;; Only used for copy256_{u,s}.w.
 (define_mode_iterator LASX_W [V8SI V8SF])
@@ -672,6 +673,41 @@
   [(set_attr "move_type" "fmove")
    (set_attr "mode" "<MODE>")])
 
+;; vr0 -> low xr0
+;;
+(define_insn "vec_cast<mode>"
+  [(set (match_operand:LASX_PART 0 "register_operand" "=f")
+	(subreg:LASX_PART
+	  (match_operand:<VHMODE256_ALL> 1 "register_operand" "0") 0))]
+  "ISA_HAS_LASX"
+  ""
+  [(set_attr "type" "simd_splat")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "vec_insert_lo_<mode>"
+  [(set (match_operand:LASX_PART 0 "register_operand" "=f")
+	(vec_concat:LASX_PART
+	  (match_operand:<VHMODE256_ALL> 2 "register_operand" "f")
+	  (vec_select:<VHMODE256_ALL>
+	    (match_operand:LASX_PART 1 "register_operand" "0")
+	    (match_operand:LASX_PART 3 "vect_par_cnst_high_half"))))]
+  "ISA_HAS_LASX"
+  "xvpermi.q\t%u0,%u2,0x30"
+  [(set_attr "type" "simd_splat")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "vec_insert_hi_<mode>"
+  [(set (match_operand:LASX_PART 0 "register_operand" "=f")
+	(vec_concat:LASX_PART
+	  (vec_select:<VHMODE256_ALL>
+	    (match_operand:LASX_PART 1 "register_operand" "0")
+	    (match_operand:LASX_PART 3 "vect_par_cnst_low_half"))
+	  (match_operand:<VHMODE256_ALL> 2 "register_operand" "f")))]
+  "ISA_HAS_LASX"
+  "xvpermi.q\t%u0,%u2,0x02"
+  [(set_attr "type" "simd_splat")
+   (set_attr "mode" "<MODE>")])
+
 (define_expand "vec_perm<mode>"
  [(match_operand:LASX 0 "register_operand")
   (match_operand:LASX 1 "register_operand")
diff --git a/gcc/config/loongarch/lasxintrin.h b/gcc/config/loongarch/lasxintrin.h
index 6bcffc26d4a5..6c34edeec25a 100644
--- a/gcc/config/loongarch/lasxintrin.h
+++ b/gcc/config/loongarch/lasxintrin.h
@@ -23,6 +23,8 @@
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 <http://www.gnu.org/licenses/>.  */
 
+#include <lsxintrin.h>
+
 #ifndef _GCC_LOONGSON_ASXINTRIN_H
 #define _GCC_LOONGSON_ASXINTRIN_H 1
 
@@ -5368,5 +5370,159 @@ __m256i __lasx_xvfcmp_sun_s (__m256 _1, __m256 _2)
 #define __lasx_xvrepli_w(/*si10*/ _1) \
   ((__m256i)__builtin_lasx_xvrepli_w ((_1)))
 
+#if defined (__loongarch_asx_sx_conv)
+/* Builtin interfaces for conversions between 128-bit and 256-bit vectors.
+   For some of the following conversion functions, the "Assembly instruction
+   format" comment does not describe exactly the assembly instruction that
+   is actually generated.
+   The Rust front end selects built-in functions by analyzing these assembly
+   format comments.  The data types in each instruction template are taken
+   from the interface of the corresponding function, in the following order:
+   output, input...  */
+/* Assembly instruction format: xd, vj.  */
+/* Data types in instruction templates:  V8SF, V4SF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256 __lasx_cast_128_s (__m128 _1)
+{
+  return (__m256)__builtin_lasx_cast_128_s ((v4f32)_1);
+}
+
+/* Assembly instruction format: xd, vj.  */
+/* Data types in instruction templates:  V4DF, V2DF.  */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256d __lasx_cast_128_d (__m128d _1)
+{
+  return (__m256d)__builtin_lasx_cast_128_d ((v2f64)_1);
+}
+
+/* Assembly instruction format: xd, vj.  */
+/* Data types in instruction templates:  V4DI, V2DI.
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_cast_128 (__m128i _1) +{ + return (__m256i)__builtin_lasx_cast_128 ((v2i64)_1); +} + +/* Assembly instruction format: xd, vj, vk. */ +/* Data types in instruction templates: V8SF, V4SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_concat_128_s (__m128 _1, __m128 _2) +{ + return (__m256)__builtin_lasx_concat_128_s ((v4f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: xd, vj, vk. */ +/* Data types in instruction templates: V4DF, V2DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_concat_128_d (__m128d _1, __m128d _2) +{ + return (__m256d)__builtin_lasx_concat_128_d ((v2f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: xd, vj, vk. */ +/* Data types in instruction templates: V4DI, V2DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_concat_128 (__m128i _1, __m128i _2) +{ + return (__m256i)__builtin_lasx_concat_128 ((v2i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: vd, xj. */ +/* Data types in instruction templates: V4SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lasx_extract_128_lo_s (__m256 _1) +{ + return (__m128)__builtin_lasx_extract_128_lo_s ((v8f32)_1); +} + +/* Assembly instruction format: vd, xj. */ +/* Data types in instruction templates: V4SF, V8SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128 __lasx_extract_128_hi_s (__m256 _1) +{ + return (__m128)__builtin_lasx_extract_128_hi_s ((v8f32)_1); +} + +/* Assembly instruction format: vd, xj. */ +/* Data types in instruction templates: V2DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lasx_extract_128_lo_d (__m256d _1) +{ + return (__m128d)__builtin_lasx_extract_128_lo_d ((v4f64)_1); +} + +/* Assembly instruction format: vd, xj. */ +/* Data types in instruction templates: V2DF, V4DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128d __lasx_extract_128_hi_d (__m256d _1) +{ + return (__m128d)__builtin_lasx_extract_128_hi_d ((v4f64)_1); +} + +/* Assembly instruction format: vd, xj. */ +/* Data types in instruction templates: V2DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lasx_extract_128_lo (__m256i _1) +{ + return (__m128i)__builtin_lasx_extract_128_lo ((v4i64)_1); +} + +/* Assembly instruction format: vd, xj. */ +/* Data types in instruction templates: V2DI, V4DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m128i __lasx_extract_128_hi (__m256i _1) +{ + return (__m128i)__builtin_lasx_extract_128_hi ((v4i64)_1); +} + +/* Assembly instruction format: xd, xj, vk. */ +/* Data types in instruction templates: V8SF, V8SF, V4SF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_insert_128_lo_s (__m256 _1, __m128 _2) +{ + return (__m256)__builtin_lasx_insert_128_lo_s ((v8f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: xd, xj, vk. */ +/* Data types in instruction templates: V8SF, V8SF, V4SF. 
*/ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256 __lasx_insert_128_hi_s (__m256 _1, __m128 _2) +{ + return (__m256)__builtin_lasx_insert_128_hi_s ((v8f32)_1, (v4f32)_2); +} + +/* Assembly instruction format: xd, xj, vk. */ +/* Data types in instruction templates: V4DF, V4DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_insert_128_lo_d (__m256d _1, __m128d _2) +{ + return (__m256d)__builtin_lasx_insert_128_lo_d ((v4f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: xd, xj, vk. */ +/* Data types in instruction templates: V4DF, V4DF, V2DF. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256d __lasx_insert_128_hi_d (__m256d _1, __m128d _2) +{ + return (__m256d)__builtin_lasx_insert_128_hi_d ((v4f64)_1, (v2f64)_2); +} + +/* Assembly instruction format: xd, xj, vk. */ +/* Data types in instruction templates: V4DI, V4DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_insert_128_lo (__m256i _1, __m128i _2) +{ + return (__m256i)__builtin_lasx_insert_128_lo ((v4i64)_1, (v2i64)_2); +} + +/* Assembly instruction format: xd, xj, vk. */ +/* Data types in instruction templates: V4DI, V4DI, V2DI. */ +extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__m256i __lasx_insert_128_hi (__m256i _1, __m128i _2) +{ + return (__m256i)__builtin_lasx_insert_128_hi ((v4i64)_1, (v2i64)_2); +} + +#endif /* defined(__loongarch_asx_sx_conv). */ #endif /* defined(__loongarch_asx). */ #endif /* _GCC_LOONGSON_ASXINTRIN_H. */ diff --git a/gcc/config/loongarch/loongarch-builtins.cc b/gcc/config/loongarch/loongarch-builtins.cc index 9493dedcab78..312d87626a4b 100644 --- a/gcc/config/loongarch/loongarch-builtins.cc +++ b/gcc/config/loongarch/loongarch-builtins.cc @@ -865,6 +865,27 @@ AVAIL_ALL (lasx_frecipe, ISA_HAS_LASX && ISA_HAS_FRECIPE) #define CODE_FOR_lasx_xvmaddwod_q_du CODE_FOR_lasx_maddwod_q_du_punned #define CODE_FOR_lasx_xvmaddwod_q_du_d CODE_FOR_lasx_maddwod_q_du_d_punned + +/* Add mutual conversion between 128 and 256 vectors. 
*/ +#define CODE_FOR_lasx_extract_128_lo_s CODE_FOR_vec_extract_lo_v8sf +#define CODE_FOR_lasx_extract_128_hi_s CODE_FOR_vec_extract_hi_v8sf +#define CODE_FOR_lasx_extract_128_lo_d CODE_FOR_vec_extract_lo_v4df +#define CODE_FOR_lasx_extract_128_hi_d CODE_FOR_vec_extract_hi_v4df +#define CODE_FOR_lasx_extract_128_lo CODE_FOR_vec_extract_lo_v4di +#define CODE_FOR_lasx_extract_128_hi CODE_FOR_vec_extract_hi_v4di +#define CODE_FOR_lasx_insert_128_lo_s CODE_FOR_vec_insert_lo_v8sf +#define CODE_FOR_lasx_insert_128_hi_s CODE_FOR_vec_insert_hi_v8sf +#define CODE_FOR_lasx_insert_128_lo_d CODE_FOR_vec_insert_lo_v4df +#define CODE_FOR_lasx_insert_128_hi_d CODE_FOR_vec_insert_hi_v4df +#define CODE_FOR_lasx_insert_128_lo CODE_FOR_vec_insert_lo_v4di +#define CODE_FOR_lasx_insert_128_hi CODE_FOR_vec_insert_hi_v4di +#define CODE_FOR_lasx_concat_128_s CODE_FOR_vec_concatv8sf +#define CODE_FOR_lasx_concat_128_d CODE_FOR_vec_concatv4df +#define CODE_FOR_lasx_concat_128 CODE_FOR_vec_concatv4di +#define CODE_FOR_lasx_cast_128_s CODE_FOR_vec_castv8sf +#define CODE_FOR_lasx_cast_128_d CODE_FOR_vec_castv4df +#define CODE_FOR_lasx_cast_128 CODE_FOR_vec_castv4di + static const struct loongarch_builtin_description loongarch_builtins[] = { #define LARCH_MOVFCSR2GR 0 DIRECT_BUILTIN (movfcsr2gr, LARCH_USI_FTYPE_UQI, hard_float), @@ -2407,7 +2428,25 @@ static const struct loongarch_builtin_description loongarch_builtins[] = { LASX_BUILTIN (xvssrarni_bu_h, LARCH_UV32QI_FTYPE_UV32QI_V32QI_USI), LASX_BUILTIN (xvssrarni_hu_w, LARCH_UV16HI_FTYPE_UV16HI_V16HI_USI), LASX_BUILTIN (xvssrarni_wu_d, LARCH_UV8SI_FTYPE_UV8SI_V8SI_USI), - LASX_BUILTIN (xvssrarni_du_q, LARCH_UV4DI_FTYPE_UV4DI_V4DI_USI) + LASX_BUILTIN (xvssrarni_du_q, LARCH_UV4DI_FTYPE_UV4DI_V4DI_USI), + LASX_BUILTIN (extract_128_lo_s, LARCH_V4SF_FTYPE_V8SF), + LASX_BUILTIN (extract_128_hi_s, LARCH_V4SF_FTYPE_V8SF), + LASX_BUILTIN (extract_128_lo_d, LARCH_V2DF_FTYPE_V4DF), + LASX_BUILTIN (extract_128_hi_d, LARCH_V2DF_FTYPE_V4DF), + LASX_BUILTIN (extract_128_lo, LARCH_V2DI_FTYPE_V4DI), + LASX_BUILTIN (extract_128_hi, LARCH_V2DI_FTYPE_V4DI), + LASX_BUILTIN (insert_128_lo_s, LARCH_V8SF_FTYPE_V8SF_V4SF), + LASX_BUILTIN (insert_128_hi_s, LARCH_V8SF_FTYPE_V8SF_V4SF), + LASX_BUILTIN (insert_128_lo_d, LARCH_V4DF_FTYPE_V4DF_V2DF), + LASX_BUILTIN (insert_128_hi_d, LARCH_V4DF_FTYPE_V4DF_V2DF), + LASX_BUILTIN (insert_128_lo, LARCH_V4DI_FTYPE_V4DI_V2DI), + LASX_BUILTIN (insert_128_hi, LARCH_V4DI_FTYPE_V4DI_V2DI), + LASX_BUILTIN (concat_128_s, LARCH_V8SF_FTYPE_V4SF_V4SF), + LASX_BUILTIN (concat_128_d, LARCH_V4DF_FTYPE_V2DF_V2DF), + LASX_BUILTIN (concat_128, LARCH_V4DI_FTYPE_V2DI_V2DI), + LASX_BUILTIN (cast_128_s, LARCH_V8SF_FTYPE_V4SF), + LASX_BUILTIN (cast_128_d, LARCH_V4DF_FTYPE_V2DF), + LASX_BUILTIN (cast_128, LARCH_V4DI_FTYPE_V2DI) }; /* Index I is the function declaration for loongarch_builtins[I], or null if @@ -3001,6 +3040,10 @@ loongarch_expand_builtin_direct (enum insn_code icode, rtx target, tree exp, { struct expand_operand ops[MAX_RECOG_OPERANDS]; int opno, argno; + /* For vector extraction/insertion operations, sel_high_p being true + indicates that the high of the data is selected/retained from the + vector register. */ + bool sel_high_p = true; /* Map any target to operand 0. 
*/ opno = 0; @@ -3019,6 +3062,51 @@ loongarch_expand_builtin_direct (enum insn_code icode, rtx target, tree exp, create_input_operand (&ops[1], CONST1_RTX (ops[0].mode), ops[0].mode); return loongarch_expand_builtin_insn (icode, 3, ops, has_target_p); + case CODE_FOR_vec_extract_lo_v8sf: + case CODE_FOR_vec_extract_lo_v4df: + case CODE_FOR_vec_extract_lo_v4di: + sel_high_p = false; + /* Fall through. */ + case CODE_FOR_vec_extract_hi_v8sf: + case CODE_FOR_vec_extract_hi_v4df: + case CODE_FOR_vec_extract_hi_v4di: + { + /* The selection method for constructing the high/low half. */ + loongarch_prepare_builtin_arg (&ops[1], exp, 0); + int nelts = GET_MODE_NUNITS (GET_MODE (ops[1].value)); + int half_nelts = nelts / 2; + int base = sel_high_p ? half_nelts : 0; + + rtx pat_rtx + = loongarch_gen_stepped_int_parallel (half_nelts, base, 1); + create_input_operand (&ops[2], pat_rtx, ops[1].mode); + + return loongarch_expand_builtin_insn (icode, 3, ops, has_target_p); + } + + case CODE_FOR_vec_insert_hi_v8sf: + case CODE_FOR_vec_insert_hi_v4df: + case CODE_FOR_vec_insert_hi_v4di: + sel_high_p = false; + /* Fall through. */ + case CODE_FOR_vec_insert_lo_v8sf: + case CODE_FOR_vec_insert_lo_v4df: + case CODE_FOR_vec_insert_lo_v4di: + { + /* The selection method for constructing the high/low half. */ + loongarch_prepare_builtin_arg (&ops[1], exp, 0); + loongarch_prepare_builtin_arg (&ops[2], exp, 1); + int nelts = GET_MODE_NUNITS (GET_MODE (ops[1].value)); + int half_nelts = nelts / 2; + int base = sel_high_p ? half_nelts : 0; + + rtx pat_rtx + = loongarch_gen_stepped_int_parallel (half_nelts, base, 1); + create_input_operand (&ops[3], pat_rtx, ops[1].mode); + + return loongarch_expand_builtin_insn (icode, 4, ops, has_target_p); + } + default: break; } diff --git a/gcc/config/loongarch/loongarch-c.cc b/gcc/config/loongarch/loongarch-c.cc index effdcf0e2554..fc031a6fe90e 100644 --- a/gcc/config/loongarch/loongarch-c.cc +++ b/gcc/config/loongarch/loongarch-c.cc @@ -132,6 +132,7 @@ loongarch_update_cpp_builtins (cpp_reader *pfile) loongarch_def_or_undef (ISA_HAS_LSX, "__loongarch_simd", pfile); loongarch_def_or_undef (ISA_HAS_LSX, "__loongarch_sx", pfile); loongarch_def_or_undef (ISA_HAS_LASX, "__loongarch_asx", pfile); + loongarch_def_or_undef (ISA_HAS_LASX, "__loongarch_asx_sx_conv", pfile); builtin_undef ("__loongarch_simd_width"); if (ISA_HAS_LSX) diff --git a/gcc/config/loongarch/loongarch-ftypes.def b/gcc/config/loongarch/loongarch-ftypes.def index 337f2c2c2293..68b1b446182c 100644 --- a/gcc/config/loongarch/loongarch-ftypes.def +++ b/gcc/config/loongarch/loongarch-ftypes.def @@ -42,6 +42,12 @@ DEF_LARCH_FTYPE (1, (USI, USI)) DEF_LARCH_FTYPE (1, (UDI, USI)) DEF_LARCH_FTYPE (1, (USI, UQI)) DEF_LARCH_FTYPE (1, (VOID, USI)) +DEF_LARCH_FTYPE (1, (V4SF, V8SF)) +DEF_LARCH_FTYPE (1, (V2DF, V4DF)) +DEF_LARCH_FTYPE (1, (V2DI, V4DI)) +DEF_LARCH_FTYPE (1, (V8SF, V4SF)) +DEF_LARCH_FTYPE (1, (V4DF, V2DF)) +DEF_LARCH_FTYPE (1, (V4DI, V2DI)) DEF_LARCH_FTYPE (2, (VOID, UQI, USI)) DEF_LARCH_FTYPE (2, (VOID, UHI, USI)) @@ -58,6 +64,12 @@ DEF_LARCH_FTYPE (2, (SI, SI, SI)) DEF_LARCH_FTYPE (2, (SI, DI, SI)) DEF_LARCH_FTYPE (2, (USI, USI, USI)) DEF_LARCH_FTYPE (2, (UDI, UDI, USI)) +DEF_LARCH_FTYPE (2, (V8SF, V4SF, V4SF)) +DEF_LARCH_FTYPE (2, (V4DF, V2DF, V2DF)) +DEF_LARCH_FTYPE (2, (V4DI, V2DI, V2DI)) +DEF_LARCH_FTYPE (2, (V8SF, V8SF, V4SF)) +DEF_LARCH_FTYPE (2, (V4DF, V4DF, V2DF)) +DEF_LARCH_FTYPE (2, (V4DI, V4DI, V2DI)) DEF_LARCH_FTYPE (3, (VOID, USI, USI, SI)) DEF_LARCH_FTYPE (3, (VOID, USI, UDI, SI)) diff --git 
a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 30eae4bdacce..00273c0b673f 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -19701,7 +19701,16 @@ into the data cache. The instruction is issued in slot I1@. These built-in functions are available for LoongArch. -Data Type Description: +@menu +* Data Types:: +* Directly-mapped Builtin Functions:: +* Directly-mapped Division Builtin Functions:: +* Other Builtin Functions:: +@end menu + +@node Data Types +@subsubsection Data Types + @itemize @item @code{imm0_31}, a compile-time constant in range 0 to 31; @item @code{imm0_16383}, a compile-time constant in range 0 to 16383; @@ -19709,6 +19718,9 @@ Data Type Description: @item @code{imm_n2048_2047}, a compile-time constant in range -2048 to 2047; @end itemize +@node Directly-mapped Builtin Functions +@subsubsection Directly-mapped Builtin Functions + The intrinsics provided are listed below: @smallexample unsigned int __builtin_loongarch_movfcsr2gr (imm0_31) @@ -19832,6 +19844,9 @@ function you need to include @code{larchintrin.h}. void __break (imm0_32767) @end smallexample +@node Directly-mapped Division Builtin Functions +@subsubsection Directly-mapped Division Builtin Functions + These intrinsic functions are available by including @code{larchintrin.h} and using @option{-mfrecipe}. @smallexample @@ -19841,6 +19856,9 @@ using @option{-mfrecipe}. double __frsqrte_d (double); @end smallexample +@node Other Builtin Functions +@subsubsection Other Builtin Functions + Additional built-in functions are available for LoongArch family processors to efficiently use 128-bit floating-point (__float128) values. @@ -19867,6 +19885,15 @@ GCC provides intrinsics to access the LSX (Loongson SIMD Extension) instructions The interface is made available by including @code{<lsxintrin.h>} and using @option{-mlsx}. +@menu +* SX Data Types:: +* Directly-mapped SX Builtin Functions:: +* Directly-mapped SX Division Builtin Functions:: +@end menu + +@node SX Data Types +@subsubsection SX Data Types + The following vectors typedefs are included in @code{lsxintrin.h}: @itemize @@ -19894,6 +19921,9 @@ input/output values manipulated: @item @code{imm_n2048_2047}, an integer literal in range -2048 to 2047. @end itemize +@node Directly-mapped SX Builtin Functions +@subsubsection Directly-mapped SX Builtin Functions + For convenience, GCC defines functions @code{__lsx_vrepli_@{b/h/w/d@}} and @code{__lsx_b[n]z_@{v/b/h/w/d@}}, which are implemented as follows: @@ -20677,6 +20707,9 @@ __m128i __lsx_vxori_b (__m128i, imm0_255); __m128i __lsx_vxor_v (__m128i, __m128i); @end smallexample +@node Directly-mapped SX Division Builtin Functions +@subsubsection Directly-mapped SX Division Builtin Functions + These intrinsic functions are available by including @code{lsxintrin.h} and using @option{-mfrecipe} and @option{-mlsx}. @smallexample @@ -20693,6 +20726,16 @@ GCC provides intrinsics to access the LASX (Loongson Advanced SIMD Extension) instructions. The interface is made available by including @code{<lasxintrin.h>} and using @option{-mlasx}. +@menu +* ASX Data Types:: +* Directly-mapped ASX Builtin Functions:: +* Directly-mapped ASX Division Builtin Functions:: +* Directly-mapped SX and ASX Conversion Builtin Functions:: +@end menu + +@node ASX Data Types +@subsubsection ASX Data Types + The following vectors typedefs are included in @code{lasxintrin.h}: @itemize @@ -20721,6 +20764,9 @@ input/output values manipulated: @item @code{imm_n2048_2047}, an integer literal in range -2048 to 2047. 
 @end itemize
 
+@node Directly-mapped ASX Builtin Functions
+@subsubsection Directly-mapped ASX Builtin Functions
+
 For convenience, GCC defines functions @code{__lasx_xvrepli_@{b/h/w/d@}} and
 @code{__lasx_b[n]z_@{v/b/h/w/d@}}, which are implemented as follows:
 
@@ -21525,6 +21571,9 @@ __m256i __lasx_xvxori_b (__m256i, imm0_255);
 __m256i __lasx_xvxor_v (__m256i, __m256i);
 @end smallexample
 
+@node Directly-mapped ASX Division Builtin Functions
+@subsubsection Directly-mapped ASX Division Builtin Functions
+
 These intrinsic functions are available by including @code{lasxintrin.h} and
 using @option{-mfrecipe} and @option{-mlasx}.
 @smallexample
@@ -21534,6 +21583,213 @@ __m256d __lasx_xvfrsqrte_d (__m256d);
 __m256 __lasx_xvfrsqrte_s (__m256);
 @end smallexample
 
+@node Directly-mapped SX and ASX Conversion Builtin Functions
+@subsubsection Directly-mapped SX and ASX Conversion Builtin Functions
+
+For convenience, @code{lasxintrin.h} now includes @code{lsxintrin.h}, and
+18 new interface functions for conversions between 128-bit and 256-bit
+vectors are provided when using the @option{-mlasx} option.
+@smallexample
+__m256 __lasx_cast_128_s (__m128);
+__m256d __lasx_cast_128_d (__m128d);
+__m256i __lasx_cast_128 (__m128i);
+__m256 __lasx_concat_128_s (__m128, __m128);
+__m256d __lasx_concat_128_d (__m128d, __m128d);
+__m256i __lasx_concat_128 (__m128i, __m128i);
+__m128 __lasx_extract_128_lo_s (__m256);
+__m128 __lasx_extract_128_hi_s (__m256);
+__m128d __lasx_extract_128_lo_d (__m256d);
+__m128d __lasx_extract_128_hi_d (__m256d);
+__m128i __lasx_extract_128_lo (__m256i);
+__m128i __lasx_extract_128_hi (__m256i);
+__m256 __lasx_insert_128_lo_s (__m256, __m128);
+__m256 __lasx_insert_128_hi_s (__m256, __m128);
+__m256d __lasx_insert_128_lo_d (__m256d, __m128d);
+__m256d __lasx_insert_128_hi_d (__m256d, __m128d);
+__m256i __lasx_insert_128_lo (__m256i, __m128i);
+__m256i __lasx_insert_128_hi (__m256i, __m128i);
+@end smallexample
+
+When GCC does not support these 128/256-bit vector conversion interfaces
+(that is, when @code{__loongarch_asx_sx_conv} is not defined), the
+following code can be used as an equivalent substitute.
+
+@smallexample
+
+ #ifndef __loongarch_asx_sx_conv
+
+ #include <lasxintrin.h>
+ #include <lsxintrin.h>
+ __m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_cast_128_s (__m128 src)
+ @{
+   __m256 dest;
+   asm ("" : "=f"(dest) : "0"(src));
+   return dest;
+ @}
+
+ __m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_cast_128_d (__m128d src)
+ @{
+   __m256d dest;
+   asm ("" : "=f"(dest) : "0"(src));
+   return dest;
+ @}
+
+ __m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_cast_128 (__m128i src)
+ @{
+   __m256i dest;
+   asm ("" : "=f"(dest) : "0"(src));
+   return dest;
+ @}
+
+ __m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_concat_128_s (__m128 src1, __m128 src2)
+ @{
+   __m256 dest;
+   asm ("xvpermi.q %u0,%u2,0x02\n"
+        : "=f"(dest)
+        : "0"(src1), "f"(src2));
+   return dest;
+ @}
+
+ __m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_concat_128_d (__m128d src1, __m128d src2)
+ @{
+   __m256d dest;
+   asm ("xvpermi.q %u0,%u2,0x02\n"
+        : "=f"(dest)
+        : "0"(src1), "f"(src2));
+   return dest;
+ @}
+
+ __m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_concat_128 (__m128i src1, __m128i src2)
+ @{
+   __m256i dest;
+   asm ("xvpermi.q %u0,%u2,0x02\n"
+        : "=f"(dest)
+        : "0"(src1), "f"(src2));
+   return dest;
+ @}
+
+ __m128 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_lo_s (__m256 src)
+ @{
+   __m128 dest;
+   asm ("" : "=f"(dest) : "0"(src));
+   return dest;
+ @}
+
+ __m128d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_lo_d (__m256d src)
+ @{
+   __m128d dest;
+   asm ("" : "=f"(dest) : "0"(src));
+   return dest;
+ @}
+
+ __m128i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_lo (__m256i src)
+ @{
+   __m128i dest;
+   asm ("" : "=f"(dest) : "0"(src));
+   return dest;
+ @}
+
+ __m128 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_hi_s (__m256 src)
+ @{
+   __m128 dest;
+   asm ("xvpermi.d %u0,%u1,0xe\n"
+        : "=f"(dest)
+        : "f"(src));
+   return dest;
+ @}
+
+ __m128d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_hi_d (__m256d src)
+ @{
+   __m128d dest;
+   asm ("xvpermi.d %u0,%u1,0xe\n"
+        : "=f"(dest)
+        : "f"(src));
+   return dest;
+ @}
+
+ __m128i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_hi (__m256i src)
+ @{
+   __m128i dest;
+   asm ("xvpermi.d %u0,%u1,0xe\n"
+        : "=f"(dest)
+        : "f"(src));
+   return dest;
+ @}
+
+ __m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_insert_128_lo_s (__m256 src1, __m128 src2)
+ @{
+   __m256 dest;
+   asm ("xvpermi.q %u0,%u2,0x30\n"
+        : "=f"(dest)
+        : "0"(src1), "f"(src2));
+   return dest;
+ @}
+
+ __m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_insert_128_lo_d (__m256d src1, __m128d src2)
+ @{
+   __m256d dest;
+   asm ("xvpermi.q %u0,%u2,0x30\n"
+        : "=f"(dest)
+        : "0"(src1), "f"(src2));
+   return dest;
+ @}
+
+ __m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_insert_128_lo (__m256i src1, __m128i src2)
+ @{
+   __m256i dest;
+   asm ("xvpermi.q %u0,%u2,0x30\n"
+        : "=f"(dest)
+        : "0"(src1), "f"(src2));
+   return dest;
+ @}
+
+ __m256 inline __attribute__((__gnu_inline__,
__always_inline__, __artificial__)) + __lasx_insert_128_hi_s (__m256 src1, __m128 src2) + @{ + __m256 dest; + asm ("xvpermi.q %u0,%u2,0x02\n" + : "=f"(dest) + : "0"(src1), "f"(src2)); + return dest; + @} + + __m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __lasx_insert_128_hi_d (__m256d src1, __m128d src2) + @{ + __m256d dest; + asm ("xvpermi.q %u0,%u2,0x02\n" + : "=f"(dest) + : "0"(src1), "f"(src2)); + return dest; + @} + + __m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __lasx_insert_128_hi (__m256i src1, __m128i src2) + @{ + __m256i dest; + asm ("xvpermi.q %u0,%u2,0x02\n" + : "=f"(dest) + : "0"(src1), "f"(src2)); + return dest; + @} + #endif + +@end smallexample + @node MIPS DSP Built-in Functions @subsection MIPS DSP Built-in Functions diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256-result.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256-result.c new file mode 100644 index 000000000000..e876c4a0b1a4 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256-result.c @@ -0,0 +1,68 @@ +/* { dg-options "-mabi=lp64d -O2 -mlasx -w -fno-strict-aliasing" } */ + +#include "../simd_correctness_check.h" +#include <lasxintrin.h> + +int +main () +{ + __m128i __m128i_op0, __m128i_op1, __m128i_op2, __m128i_out, __m128i_result; + __m128 __m128_op0, __m128_op1, __m128_op2, __m128_out, __m128_result; + __m128d __m128d_op0, __m128d_op1, __m128d_op2, __m128d_out, __m128d_result; + + __m256i __m256i_op0, __m256i_op1, __m256i_op2, __m256i_out, __m256i_result; + __m256 __m256_op0, __m256_op1, __m256_op2, __m256_out, __m256_result; + __m256d __m256d_op0, __m256d_op1, __m256d_op2, __m256d_out, __m256d_result; + + //__m128_op0={1,2,3,4},__m128_op1={5,6,7,8}; + *((int *)&__m128_op0[3]) = 0x40800000; + *((int *)&__m128_op0[2]) = 0x40400000; + *((int *)&__m128_op0[1]) = 0x40000000; + *((int *)&__m128_op0[0]) = 0x3f800000; + *((int *)&__m128_op1[3]) = 0x41000000; + *((int *)&__m128_op1[2]) = 0x40e00000; + *((int *)&__m128_op1[1]) = 0x40c00000; + *((int *)&__m128_op1[0]) = 0x40a00000; + *((int *)&__m256_result[7]) = 0x41000000; + *((int *)&__m256_result[6]) = 0x40e00000; + *((int *)&__m256_result[5]) = 0x40c00000; + *((int *)&__m256_result[4]) = 0x40a00000; + *((int *)&__m256_result[3]) = 0x40800000; + *((int *)&__m256_result[2]) = 0x40400000; + *((int *)&__m256_result[1]) = 0x40000000; + *((int *)&__m256_result[0]) = 0x3f800000; + __m256_out = __lasx_concat_128_s (__m128_op0, __m128_op1); + ASSERTEQ_32 (__LINE__, __m256_result, __m256_out); + __m256_out = __lasx_cast_128_s (__m128_op0); + ASSERTEQ_32 (__LINE__, __m256_out, __m128_op0); + + //__m128i_op0={1,2},__m128i_op1={3,4}; + *((unsigned long *)&__m128i_op0[1]) = 0x2; + *((unsigned long *)&__m128i_op0[0]) = 0x1; + *((unsigned long *)&__m128i_op1[1]) = 0x4; + *((unsigned long *)&__m128i_op1[0]) = 0x3; + *((unsigned long *)&__m256i_result[3]) = 0x4; + *((unsigned long *)&__m256i_result[2]) = 0x3; + *((unsigned long *)&__m256i_result[1]) = 0x2; + *((unsigned long *)&__m256i_result[0]) = 0x1; + __m256i_out = __lasx_concat_128 (__m128i_op0, __m128i_op1); + ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out); + __m256i_out = __lasx_cast_128 (__m128i_op0); + ASSERTEQ_64 (__LINE__, __m256i_out, __m128i_op0); + + //__m128d_op0={1,2},__m128i_op1={3,4}; + *((unsigned long *)&__m128d_op0[1]) = 0x4000000000000000; + *((unsigned long *)&__m128d_op0[0]) = 0x3ff0000000000000; + *((unsigned long *)&__m128d_op1[1]) = 
0x4010000000000000; + *((unsigned long *)&__m128d_op1[0]) = 0x4008000000000000; + *((unsigned long *)&__m256d_result[3]) = 0x4010000000000000; + *((unsigned long *)&__m256d_result[2]) = 0x4008000000000000; + *((unsigned long *)&__m256d_result[1]) = 0x4000000000000000; + *((unsigned long *)&__m256d_result[0]) = 0x3ff0000000000000; + __m256d_out = __lasx_concat_128_d (__m128d_op0, __m128d_op1); + ASSERTEQ_64 (__LINE__, __m256d_result, __m256d_out); + __m256d_out = __lasx_cast_128_d (__m128d_op0); + ASSERTEQ_64 (__LINE__, __m256d_out, __m128d_op0); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256.c new file mode 100644 index 000000000000..5d8cbb2e68f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-concat-128-256.c @@ -0,0 +1,92 @@ +/* { dg-do compile { target { loongarch64*-*-* } } } */ +/* { dg-options "-mabi=lp64d -O2 -mlasx" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include <lasxintrin.h> + +/* +**foo1: +** vinsgr2vr.d (\$vr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r7,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r6,1 +** vinsgr2vr.d (\$vr[0-9]+),\$r8,1 +** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256 +foo1 (__m128 x, __m128 y) +{ + return __builtin_lasx_concat_128_s (x, y); +} + +/* +**foo2: +** vinsgr2vr.d (\$vr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r7,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r6,1 +** vinsgr2vr.d (\$vr[0-9]+),\$r8,1 +** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256d +foo2 (__m128d x, __m128d y) +{ + return __builtin_lasx_concat_128_d (x, y); +} + +/* +**foo3: +** vinsgr2vr.d (\$vr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r7,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r6,1 +** vinsgr2vr.d (\$vr[0-9]+),\$r8,1 +** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256i +foo3 (__m128i x, __m128i y) +{ + return __builtin_lasx_concat_128 (x, y); +} + +/* +**foo4: +** vinsgr2vr.d (\$vr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r6,1 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256 +foo4 (__m128 x) +{ + return __builtin_lasx_cast_128_s (x); +} + +/* +**foo5: +** vinsgr2vr.d (\$vr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r6,1 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256d +foo5 (__m128d x) +{ + return __builtin_lasx_cast_128_d (x); +} + +/* +**foo6: +** vinsgr2vr.d (\$vr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r6,1 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256i +foo6 (__m128i x) +{ + return __builtin_lasx_cast_128 (x); +} diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128-result.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128-result.c new file mode 100644 index 000000000000..61064d6e1b7c --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128-result.c @@ -0,0 +1,69 @@ +/* { dg-options "-mabi=lp64d -O2 -mlasx -w -fno-strict-aliasing" } */ + +#include "../simd_correctness_check.h" +#include <lasxintrin.h> + +extern void abort (void); +int +main () +{ + __m128i __m128i_result0, __m128i_result1, __m128i_out, __m128i_result; + __m128 __m128_result0, __m128_result1, __m128_out, __m128_result; + __m128d __m128d_result0, __m128d_result1, __m128d_out, __m128d_result; + + __m256i __m256i_op0, __m256i_op1, __m256i_op2, __m256i_out, __m256i_result; + __m256 __m256_op0, __m256_op1, __m256_op2, 
__m256_out, __m256_result; + __m256d __m256d_op0, __m256d_op1, __m256d_op2, __m256d_out, __m256d_result; + + //__m256_op0 = {1,2,3,4,5,6,7,8}; + *((int *)&__m256_op0[7]) = 0x41000000; + *((int *)&__m256_op0[6]) = 0x40e00000; + *((int *)&__m256_op0[5]) = 0x40c00000; + *((int *)&__m256_op0[4]) = 0x40a00000; + *((int *)&__m256_op0[3]) = 0x40800000; + *((int *)&__m256_op0[2]) = 0x40400000; + *((int *)&__m256_op0[1]) = 0x40000000; + *((int *)&__m256_op0[0]) = 0x3f800000; + *((int *)&__m128_result1[3]) = 0x41000000; + *((int *)&__m128_result1[2]) = 0x40e00000; + *((int *)&__m128_result1[1]) = 0x40c00000; + *((int *)&__m128_result1[0]) = 0x40a00000; + *((int *)&__m128_result0[3]) = 0x40800000; + *((int *)&__m128_result0[2]) = 0x40400000; + *((int *)&__m128_result0[1]) = 0x40000000; + *((int *)&__m128_result0[0]) = 0x3f800000; + __m128_out = __lasx_extract_128_lo_s (__m256_op0); + ASSERTEQ_32 (__LINE__, __m128_result0, __m128_out); + __m128_out = __lasx_extract_128_hi_s (__m256_op0); + ASSERTEQ_32 (__LINE__, __m128_result1, __m128_out); + + //__m256i_op0 = {1,2,3,4}; + *((unsigned long *)&__m256i_op0[3]) = 0x4; + *((unsigned long *)&__m256i_op0[2]) = 0x3; + *((unsigned long *)&__m256i_op0[1]) = 0x2; + *((unsigned long *)&__m256i_op0[0]) = 0x1; + *((unsigned long *)&__m128i_result0[1]) = 0x2; + *((unsigned long *)&__m128i_result0[0]) = 0x1; + *((unsigned long *)&__m128i_result1[1]) = 0x4; + *((unsigned long *)&__m128i_result1[0]) = 0x3; + __m128i_out = __lasx_extract_128_lo (__m256i_op0); + ASSERTEQ_64 (__LINE__, __m128i_result0, __m128i_out); + __m128i_out = __lasx_extract_128_hi (__m256i_op0); + ASSERTEQ_64 (__LINE__, __m128i_result1, __m128i_out); + + //__m256d_op0 = {1,2,3,4}; + *((unsigned long *)&__m256d_op0[3]) = 0x4010000000000000; + *((unsigned long *)&__m256d_op0[2]) = 0x4008000000000000; + *((unsigned long *)&__m256d_op0[1]) = 0x4000000000000000; + *((unsigned long *)&__m256d_op0[0]) = 0x3ff0000000000000; + *((unsigned long *)&__m128d_result0[1]) = 0x4000000000000000; + *((unsigned long *)&__m128d_result0[0]) = 0x3ff0000000000000; + *((unsigned long *)&__m128d_result1[1]) = 0x4010000000000000; + *((unsigned long *)&__m128d_result1[0]) = 0x4008000000000000; + __m128d_out = __lasx_extract_128_lo_d (__m256d_op0); + ASSERTEQ_64 (__LINE__, __m128d_result0, __m128d_out); + __m128d_out = __lasx_extract_128_hi_d (__m256d_op0); + ASSERTEQ_64 (__LINE__, __m128d_result1, __m128d_out); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128.c new file mode 100644 index 000000000000..d2219ea82de2 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-extract-256-128.c @@ -0,0 +1,86 @@ +/* { dg-do compile { target { loongarch64*-*-* } } } */ +/* { dg-options "-mabi=lp64d -O2 -mlasx" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include <lasxintrin.h> + +/* +**foo1_lo: +** vld (\$vr[0-9]+),\$r4,0 +** vpickve2gr.du \$r4,(\$vr[0-9]+),0 +** vpickve2gr.du \$r5,(\$vr[0-9]+),1 +** jr \$r1 +*/ +__m128 +foo1_lo (__m256 x) +{ + return __lasx_extract_128_lo_s (x); +} + +/* +**foo1_hi: +** xvld (\$xr[0-9]+),\$r4,0 +** xvpermi.d (\$xr[0-9]+),(\$xr[0-9]+),0xe +** vpickve2gr.du \$r4,(\$vr[0-9]+),0 +** vpickve2gr.du \$r5,(\$vr[0-9]+),1 +** jr \$r1 +*/ +__m128 +foo1_hi (__m256 x) +{ + return __lasx_extract_128_hi_s (x); +} + +/* +**foo2_lo: +** vld (\$vr[0-9]+),\$r4,0 +** vpickve2gr.du \$r4,(\$vr[0-9]+),0 +** vpickve2gr.du \$r5,(\$vr[0-9]+),1 +** jr \$r1 +*/ +__m128d 
+foo2_lo (__m256d x) +{ + return __lasx_extract_128_lo_d (x); +} + +/* +**foo2_hi: +** xvld (\$xr[0-9]+),\$r4,0 +** xvpermi.d (\$xr[0-9]+),(\$xr[0-9]+),0xe +** vpickve2gr.du \$r4,(\$vr[0-9]+),0 +** vpickve2gr.du \$r5,(\$vr[0-9]+),1 +** jr \$r1 +*/ +__m128d +foo2_hi (__m256d x) +{ + return __lasx_extract_128_hi_d (x); +} + +/* +**foo3_lo: +** vld (\$vr[0-9]+),\$r4,0 +** vpickve2gr.du \$r4,(\$vr[0-9]+),0 +** vpickve2gr.du \$r5,(\$vr[0-9]+),1 +** jr \$r1 +*/ +__m128i +foo3_lo (__m256i x) +{ + return __lasx_extract_128_lo (x); +} + +/* +**foo3_hi: +** xvld (\$xr[0-9]+),\$r4,0 +** xvpermi.d (\$xr[0-9]+),(\$xr[0-9]+),0xe +** vpickve2gr.du \$r4,(\$vr[0-9]+),0 +** vpickve2gr.du \$r5,(\$vr[0-9]+),1 +** jr \$r1 +*/ +__m128i +foo3_hi (__m256i x) +{ + return __lasx_extract_128_hi (x); +} diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256-result.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256-result.c new file mode 100644 index 000000000000..ce5abf94f4f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256-result.c @@ -0,0 +1,97 @@ +/* { dg-options "-mabi=lp64d -O2 -mlasx -w -fno-strict-aliasing" } */ + +#include "../simd_correctness_check.h" +#include <lasxintrin.h> + +extern void abort (void); +int +main () +{ + __m128i __m128i_op0, __m128i_op1, __m128i_out; + __m128 __m128_op0, __m128_op1, __m128_out; + __m128d __m128d_op0, __m128d_op1, __m128d_out; + + __m256i __m256i_op0, __m256i_result0, __m256i_result1, __m256i_out; + __m256 __m256_op0, __m256_result0, __m256_result1, __m256_out; + __m256d __m256d_op0, __m256d_result0, __m256d_result1, __m256d_out; + + //__m256_op0 = {1,2,3,4,5,6,7,8}, __m128_op0 ={9,9,9,9}; + *((int *)&__m256_op0[7]) = 0x41000000; + *((int *)&__m256_op0[6]) = 0x40e00000; + *((int *)&__m256_op0[5]) = 0x40c00000; + *((int *)&__m256_op0[4]) = 0x40a00000; + *((int *)&__m256_op0[3]) = 0x40800000; + *((int *)&__m256_op0[2]) = 0x40400000; + *((int *)&__m256_op0[1]) = 0x40000000; + *((int *)&__m256_op0[0]) = 0x3f800000; + *((int *)&__m128_op0[3]) = 0x41100000; + *((int *)&__m128_op0[2]) = 0x41100000; + *((int *)&__m128_op0[1]) = 0x41100000; + *((int *)&__m128_op0[0]) = 0x41100000; + *((int *)&__m256_result0[7]) = 0x41000000; + *((int *)&__m256_result0[6]) = 0x40e00000; + *((int *)&__m256_result0[5]) = 0x40c00000; + *((int *)&__m256_result0[4]) = 0x40a00000; + *((int *)&__m256_result0[3]) = 0x41100000; + *((int *)&__m256_result0[2]) = 0x41100000; + *((int *)&__m256_result0[1]) = 0x41100000; + *((int *)&__m256_result0[0]) = 0x41100000; + *((int *)&__m256_result1[7]) = 0x41100000; + *((int *)&__m256_result1[6]) = 0x41100000; + *((int *)&__m256_result1[5]) = 0x41100000; + *((int *)&__m256_result1[4]) = 0x41100000; + *((int *)&__m256_result1[3]) = 0x40800000; + *((int *)&__m256_result1[2]) = 0x40400000; + *((int *)&__m256_result1[1]) = 0x40000000; + *((int *)&__m256_result1[0]) = 0x3f800000; + __m256_out = __lasx_insert_128_lo_s (__m256_op0, __m128_op0); + ASSERTEQ_32 (__LINE__, __m256_result0, __m256_out); + __m256_out = __lasx_insert_128_hi_s (__m256_op0, __m128_op0); + ASSERTEQ_32 (__LINE__, __m256_result1, __m256_out); + + //__m256i_op0 ={1,2,3,4},__m128i_op0={5,6},__m128i_op1={7,8}; + *((unsigned long *)&__m256i_op0[3]) = 0x4; + *((unsigned long *)&__m256i_op0[2]) = 0x3; + *((unsigned long *)&__m256i_op0[1]) = 0x2; + *((unsigned long *)&__m256i_op0[0]) = 0x1; + *((unsigned long *)&__m128i_op0[1]) = 0x6; + *((unsigned long *)&__m128i_op0[0]) = 0x5; + *((unsigned long *)&__m128i_op1[1]) = 0x8; + 
*((unsigned long *)&__m128i_op1[0]) = 0x7; + *((unsigned long *)&__m256i_result0[3]) = 0x4; + *((unsigned long *)&__m256i_result0[2]) = 0x3; + *((unsigned long *)&__m256i_result0[1]) = 0x6; + *((unsigned long *)&__m256i_result0[0]) = 0x5; + *((unsigned long *)&__m256i_result1[3]) = 0x8; + *((unsigned long *)&__m256i_result1[2]) = 0x7; + *((unsigned long *)&__m256i_result1[1]) = 0x2; + *((unsigned long *)&__m256i_result1[0]) = 0x1; + __m256i_out = __lasx_insert_128_lo (__m256i_op0, __m128i_op0); + ASSERTEQ_64 (__LINE__, __m256i_result0, __m256i_out); + __m256i_out = __lasx_insert_128_hi (__m256i_op0, __m128i_op1); + ASSERTEQ_64 (__LINE__, __m256i_result1, __m256i_out); + + //__m256d_op0 ={1,2,3,4},__m128d_op0={5,6},__m128d_op1={7,8}; + *((unsigned long *)&__m256d_op0[3]) = 0x4010000000000000; + *((unsigned long *)&__m256d_op0[2]) = 0x4008000000000000; + *((unsigned long *)&__m256d_op0[1]) = 0x4000000000000000; + *((unsigned long *)&__m256d_op0[0]) = 0x3ff0000000000000; + *((unsigned long *)&__m128d_op0[1]) = 0x4018000000000000; + *((unsigned long *)&__m128d_op0[0]) = 0x4014000000000000; + *((unsigned long *)&__m128d_op1[1]) = 0x4020000000000000; + *((unsigned long *)&__m128d_op1[0]) = 0x401c000000000000; + *((unsigned long *)&__m256d_result0[3]) = 0x4010000000000000; + *((unsigned long *)&__m256d_result0[2]) = 0x4008000000000000; + *((unsigned long *)&__m256d_result0[1]) = 0x4018000000000000; + *((unsigned long *)&__m256d_result0[0]) = 0x4014000000000000; + *((unsigned long *)&__m256d_result1[3]) = 0x4020000000000000; + *((unsigned long *)&__m256d_result1[2]) = 0x401c000000000000; + *((unsigned long *)&__m256d_result1[1]) = 0x4000000000000000; + *((unsigned long *)&__m256d_result1[0]) = 0x3ff0000000000000; + __m256d_out = __lasx_insert_128_lo_d (__m256d_op0, __m128d_op0); + ASSERTEQ_64 (__LINE__, __m256d_result0, __m256d_out); + __m256d_out = __lasx_insert_128_hi_d (__m256d_op0, __m128d_op1); + ASSERTEQ_64 (__LINE__, __m256d_result1, __m256d_out); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256.c new file mode 100644 index 000000000000..326c855f229f --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/vect-insert-128-256.c @@ -0,0 +1,95 @@ +/* { dg-do compile { target { loongarch64*-*-* } } } */ +/* { dg-options "-mabi=lp64d -O2 -mlasx" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include <lasxintrin.h> + +/* +**foo1: +** vinsgr2vr.d (\$vr[0-9]+),\$r6,0 +** xvld (\$xr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r7,1 +** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x30 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256 +foo1 (__m256 x, __m128 y) +{ + return __builtin_lasx_insert_128_lo_s (x, y); +} + +/* +**foo2: +** vinsgr2vr.d (\$vr[0-9]+),\$r6,0 +** xvld (\$xr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r7,1 +** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256 +foo2 (__m256 x, __m128 y) +{ + return __builtin_lasx_insert_128_hi_s (x, y); +} + +/* +**foo3: +** vinsgr2vr.d (\$vr[0-9]+),\$r6,0 +** xvld (\$xr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r7,1 +** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x30 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256d +foo3 (__m256d x, __m128d y) +{ + return __builtin_lasx_insert_128_lo_d (x, y); +} + +/* +**foo4: +** vinsgr2vr.d (\$vr[0-9]+),\$r6,0 +** xvld (\$xr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r7,1 +** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02 +** xvst 
(\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256d +foo4 (__m256d x, __m128d y) +{ + return __builtin_lasx_insert_128_hi_d (x, y); +} + +/* +**foo5: +** vinsgr2vr.d (\$vr[0-9]+),\$r6,0 +** xvld (\$xr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r7,1 +** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x30 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256i +foo5 (__m256i x, __m128i y) +{ + return __builtin_lasx_insert_128_lo (x, y); +} + +/* +**foo6: +** vinsgr2vr.d (\$vr[0-9]+),\$r6,0 +** xvld (\$xr[0-9]+),\$r5,0 +** vinsgr2vr.d (\$vr[0-9]+),\$r7,1 +** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02 +** xvst (\$xr[0-9]+),\$r4,0 +** jr \$r1 +*/ +__m256i +foo6 (__m256i x, __m128i y) +{ + return __builtin_lasx_insert_128_hi (x, y); +}
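
---

As a usage illustration (not part of the patch): the sketch below assumes a GCC
that contains this change, compiled with -mlasx.  process_pairs is a
hypothetical helper; __lasx_xvfadd_s is a pre-existing LASX intrinsic, and only
the concat/extract interfaces come from this patch.  The
__loongarch_asx_sx_conv test shows how code can stay portable to compilers
without this change.

#include <lasxintrin.h>   /* With this patch, also pulls in lsxintrin.h.  */

#if defined (__loongarch_asx_sx_conv)
void
process_pairs (__m128 a, __m128 b, __m128 *lo, __m128 *hi)
{
  /* Pack both 128-bit inputs into one 256-bit register:
     'a' becomes the low half, 'b' the high half.  */
  __m256 v = __lasx_concat_128_s (a, b);

  /* A single 256-bit operation now processes both inputs.  */
  v = __lasx_xvfadd_s (v, v);

  /* Split the 256-bit result back into its 128-bit halves.  */
  *lo = __lasx_extract_128_lo_s (v);   /* a + a  */
  *hi = __lasx_extract_128_hi_s (v);   /* b + b  */
}
#else
/* Older GCC: fall back to the inline-asm substitutes documented in the
   extend.texi hunk above.  */
#endif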
