https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85048
Bug ID: 85048
Summary: [missed optimization] vector conversions
Product: gcc  Version: 8.0.1  Status: UNCONFIRMED  Severity: normal  Priority: P3
Component: target  Assignee: unassigned at gcc dot gnu.org
Reporter: kretz at kde dot org  Target Milestone: ---

The following testcase lists all integer and/or float conversions applied to vector builtins of the same number of elements. All of those functions can be compiled to a single instruction (the function's name plus `ret`) when `-march=skylake-avx512` is active. AFAICS many conversion instructions in the SSE and AVX ISA extensions are also unsupported. I would expect this code to compile to optimal conversion sequences even on -O2 (and lower) since the conversion is applied directly on vector builtins. If this is not in scope, I'd like to open a feature request for something like clang's __builtin_convertvector (could even be done via static_cast) that produces optimal conversion instruction sequences on vector builtins without the auto-vectorizer.
#include <cstdint>

// V<T, N> is a GNU vector type with N lanes of element type T.
// Size is the byte size of the vector; the default N * sizeof(T)
// yields exactly N lanes.
template <class T, int N, int Size = N * sizeof(T)>
using V [[gnu::vector_size(Size)]] = T;

// Element-wise conversion helpers: each cvtN converts an N-lane vector
// from Src to Dst by spelling out a plain scalar conversion per lane,
// so the compiler sees the conversion applied directly to vector lanes.
template <class Src, class Dst>
V<Dst, 2> cvt2(V<Src, 2> v)
{
    return V<Dst, 2>{Dst(v[0]), Dst(v[1])};
}

template <class Src, class Dst>
V<Dst, 4> cvt4(V<Src, 4> v)
{
    return V<Dst, 4>{Dst(v[0]), Dst(v[1]), Dst(v[2]), Dst(v[3])};
}

template <class Src, class Dst>
V<Dst, 8> cvt8(V<Src, 8> v)
{
    return V<Dst, 8>{Dst(v[0]), Dst(v[1]), Dst(v[2]), Dst(v[3]),
                     Dst(v[4]), Dst(v[5]), Dst(v[6]), Dst(v[7])};
}

template <class Src, class Dst>
V<Dst, 16> cvt16(V<Src, 16> v)
{
    return V<Dst, 16>{Dst(v[0]),  Dst(v[1]),  Dst(v[2]),  Dst(v[3]),
                      Dst(v[4]),  Dst(v[5]),  Dst(v[6]),  Dst(v[7]),
                      Dst(v[8]),  Dst(v[9]),  Dst(v[10]), Dst(v[11]),
                      Dst(v[12]), Dst(v[13]), Dst(v[14]), Dst(v[15])};
}

template <class Src, class Dst>
V<Dst, 32> cvt32(V<Src, 32> v)
{
    return V<Dst, 32>{Dst(v[0]),  Dst(v[1]),  Dst(v[2]),  Dst(v[3]),
                      Dst(v[4]),  Dst(v[5]),  Dst(v[6]),  Dst(v[7]),
                      Dst(v[8]),  Dst(v[9]),  Dst(v[10]), Dst(v[11]),
                      Dst(v[12]), Dst(v[13]), Dst(v[14]), Dst(v[15]),
                      Dst(v[16]), Dst(v[17]), Dst(v[18]), Dst(v[19]),
                      Dst(v[20]), Dst(v[21]), Dst(v[22]), Dst(v[23]),
                      Dst(v[24]), Dst(v[25]), Dst(v[26]), Dst(v[27]),
                      Dst(v[28]), Dst(v[29]), Dst(v[30]), Dst(v[31])};
}

template <class Src, class Dst>
V<Dst, 64> cvt64(V<Src, 64> v)
{
    return V<Dst, 64>{Dst(v[0]),  Dst(v[1]),  Dst(v[2]),  Dst(v[3]),
                      Dst(v[4]),  Dst(v[5]),  Dst(v[6]),  Dst(v[7]),
                      Dst(v[8]),  Dst(v[9]),  Dst(v[10]), Dst(v[11]),
                      Dst(v[12]), Dst(v[13]), Dst(v[14]), Dst(v[15]),
                      Dst(v[16]), Dst(v[17]), Dst(v[18]), Dst(v[19]),
                      Dst(v[20]), Dst(v[21]), Dst(v[22]), Dst(v[23]),
                      Dst(v[24]), Dst(v[25]), Dst(v[26]), Dst(v[27]),
                      Dst(v[28]), Dst(v[29]), Dst(v[30]), Dst(v[31]),
                      Dst(v[32]), Dst(v[33]), Dst(v[34]), Dst(v[35]),
                      Dst(v[36]), Dst(v[37]), Dst(v[38]), Dst(v[39]),
                      Dst(v[40]), Dst(v[41]), Dst(v[42]), Dst(v[43]),
                      Dst(v[44]), Dst(v[45]), Dst(v[46]), Dst(v[47]),
                      Dst(v[48]), Dst(v[49]), Dst(v[50]), Dst(v[51]),
                      Dst(v[52]), Dst(v[53]), Dst(v[54]), Dst(v[55]),
                      Dst(v[56]), Dst(v[57]), Dst(v[58]), Dst(v[59]),
                      Dst(v[60]), Dst(v[61]), Dst(v[62]), Dst(v[63])};
}

// Instantiates one conversion function per (source type, destination type,
// lane count); each function is named after the instruction the conversion
// should ideally compile to.  A trailing '_' marks the variant whose only
// difference is the signedness of the destination element type.
#define _(name, from, to, size) \
    auto name(V<from, size> x) { return cvt##size<from, to>(x); }

// integral -> integral; truncation
_(vpmovqd,  uint64_t, uint32_t, 2)
_(vpmovqd,  uint64_t, uint32_t, 4)
_(vpmovqd,  uint64_t, uint32_t, 8)
_(vpmovqd,  int64_t,  uint32_t, 2)
_(vpmovqd,  int64_t,  uint32_t, 4)
_(vpmovqd,  int64_t,  uint32_t, 8)
_(vpmovqd_, uint64_t, int32_t,  2)
_(vpmovqd_, uint64_t, int32_t,  4)
_(vpmovqd_, uint64_t, int32_t,  8)
_(vpmovqd_, int64_t,  int32_t,  2)
_(vpmovqd_, int64_t,  int32_t,  4)
_(vpmovqd_, int64_t,  int32_t,  8)
_(vpmovqw,  uint64_t, uint16_t, 2)
_(vpmovqw,  uint64_t, uint16_t, 4)
_(vpmovqw,  uint64_t, uint16_t, 8)
_(vpmovqw,  int64_t,  uint16_t, 2)
_(vpmovqw,  int64_t,  uint16_t, 4)
_(vpmovqw,  int64_t,  uint16_t, 8)
_(vpmovqw_, uint64_t, int16_t,  2)
_(vpmovqw_, uint64_t, int16_t,  4)
_(vpmovqw_, uint64_t, int16_t,  8)
_(vpmovqw_, int64_t,  int16_t,  2)
_(vpmovqw_, int64_t,  int16_t,  4)
_(vpmovqw_, int64_t,  int16_t,  8)
_(vpmovqb,  uint64_t, uint8_t,  2)
_(vpmovqb,  uint64_t, uint8_t,  4)
_(vpmovqb,  uint64_t, uint8_t,  8)
_(vpmovqb,  int64_t,  uint8_t,  2)
_(vpmovqb,  int64_t,  uint8_t,  4)
_(vpmovqb,  int64_t,  uint8_t,  8)
_(vpmovqb_, uint64_t, int8_t,   2)
_(vpmovqb_, uint64_t, int8_t,   4)
_(vpmovqb_, uint64_t, int8_t,   8)
_(vpmovqb_, int64_t,  int8_t,   2)
_(vpmovqb_, int64_t,  int8_t,   4)
_(vpmovqb_, int64_t,  int8_t,   8)
_(vpmovdw,  uint32_t, uint16_t, 4)
_(vpmovdw,  uint32_t, uint16_t, 8)
_(vpmovdw,  uint32_t, uint16_t, 16)
_(vpmovdw,  int32_t,  uint16_t, 4)
_(vpmovdw,  int32_t,  uint16_t, 8)
_(vpmovdw,  int32_t,  uint16_t, 16)
_(vpmovdw_, uint32_t, int16_t,  4)
_(vpmovdw_, uint32_t, int16_t,  8)
_(vpmovdw_, uint32_t, int16_t,  16)
_(vpmovdw_, int32_t,  int16_t,  4)
_(vpmovdw_, int32_t,  int16_t,  8)
_(vpmovdw_, int32_t,  int16_t,  16)
_(vpmovdb,  uint32_t, uint8_t,  4)
_(vpmovdb,  uint32_t, uint8_t,  8)
_(vpmovdb,  uint32_t, uint8_t,  16)
_(vpmovdb,  int32_t,  uint8_t,  4)
_(vpmovdb,  int32_t,  uint8_t,  8)
_(vpmovdb,  int32_t,  uint8_t,  16)
_(vpmovdb_, uint32_t, int8_t,   4)
_(vpmovdb_, uint32_t, int8_t,   8)
// integral -> integral; truncation (continued)
_(vpmovdb_, uint32_t, int8_t,  16)
_(vpmovdb_, int32_t,  int8_t,  4)
_(vpmovdb_, int32_t,  int8_t,  8)
_(vpmovdb_, int32_t,  int8_t,  16)
_(vpmovwb,  uint16_t, uint8_t, 8)
_(vpmovwb,  uint16_t, uint8_t, 16)
_(vpmovwb,  uint16_t, uint8_t, 32)
_(vpmovwb,  int16_t,  uint8_t, 8)
_(vpmovwb,  int16_t,  uint8_t, 16)
_(vpmovwb,  int16_t,  uint8_t, 32)
_(vpmovwb_, uint16_t, int8_t,  8)
_(vpmovwb_, uint16_t, int8_t,  16)
_(vpmovwb_, uint16_t, int8_t,  32)
_(vpmovwb_, int16_t,  int8_t,  8)
_(vpmovwb_, int16_t,  int8_t,  16)
_(vpmovwb_, int16_t,  int8_t,  32)

// integral -> integral; zero extension
_(vpmovzxbw,  uint8_t,  int16_t,  8)
_(vpmovzxbw,  uint8_t,  int16_t,  16)
_(vpmovzxbw,  uint8_t,  int16_t,  32)
_(vpmovzxbw_, uint8_t,  uint16_t, 8)
_(vpmovzxbw_, uint8_t,  uint16_t, 16)
_(vpmovzxbw_, uint8_t,  uint16_t, 32)
_(vpmovzxbd,  uint8_t,  int32_t,  4)
_(vpmovzxbd,  uint8_t,  int32_t,  8)
_(vpmovzxbd,  uint8_t,  int32_t,  16)
_(vpmovzxwd,  uint16_t, int32_t,  4)
_(vpmovzxwd,  uint16_t, int32_t,  8)
_(vpmovzxwd,  uint16_t, int32_t,  16)
_(vpmovzxbd_, uint8_t,  uint32_t, 4)
_(vpmovzxbd_, uint8_t,  uint32_t, 8)
_(vpmovzxbd_, uint8_t,  uint32_t, 16)
_(vpmovzxwd_, uint16_t, uint32_t, 4)
_(vpmovzxwd_, uint16_t, uint32_t, 8)
_(vpmovzxwd_, uint16_t, uint32_t, 16)
_(vpmovzxbq,  uint8_t,  int64_t,  2)
_(vpmovzxbq,  uint8_t,  int64_t,  4)
_(vpmovzxbq,  uint8_t,  int64_t,  8)
_(vpmovzxwq,  uint16_t, int64_t,  2)
_(vpmovzxwq,  uint16_t, int64_t,  4)
_(vpmovzxwq,  uint16_t, int64_t,  8)
_(vpmovzxdq,  uint32_t, int64_t,  2)
_(vpmovzxdq,  uint32_t, int64_t,  4)
_(vpmovzxdq,  uint32_t, int64_t,  8)
_(vpmovzxbq_, uint8_t,  uint64_t, 2)
_(vpmovzxbq_, uint8_t,  uint64_t, 4)
_(vpmovzxbq_, uint8_t,  uint64_t, 8)
_(vpmovzxwq_, uint16_t, uint64_t, 2)
_(vpmovzxwq_, uint16_t, uint64_t, 4)
_(vpmovzxwq_, uint16_t, uint64_t, 8)
_(vpmovzxdq_, uint32_t, uint64_t, 2)
_(vpmovzxdq_, uint32_t, uint64_t, 4)
_(vpmovzxdq_, uint32_t, uint64_t, 8)

// integral -> integral; sign extension
_(vpmovsxbw,  int8_t,  int16_t,  8)
_(vpmovsxbw,  int8_t,  int16_t,  16)
_(vpmovsxbw,  int8_t,  int16_t,  32)
_(vpmovsxbw_, int8_t,  uint16_t, 8)
_(vpmovsxbw_, int8_t,  uint16_t, 16)
_(vpmovsxbw_, int8_t,  uint16_t, 32)
_(vpmovsxbd,  int8_t,  int32_t,  4)
_(vpmovsxbd,  int8_t,  int32_t,  8)
_(vpmovsxbd,  int8_t,  int32_t,  16)
_(vpmovsxwd,  int16_t, int32_t,  4)
_(vpmovsxwd,  int16_t, int32_t,  8)
_(vpmovsxwd,  int16_t, int32_t,  16)
_(vpmovsxbd_, int8_t,  uint32_t, 4)
_(vpmovsxbd_, int8_t,  uint32_t, 8)
_(vpmovsxbd_, int8_t,  uint32_t, 16)
_(vpmovsxwd_, int16_t, uint32_t, 4)
_(vpmovsxwd_, int16_t, uint32_t, 8)
_(vpmovsxwd_, int16_t, uint32_t, 16)
_(vpmovsxbq,  int8_t,  int64_t,  2)
_(vpmovsxbq,  int8_t,  int64_t,  4)
_(vpmovsxbq,  int8_t,  int64_t,  8)
_(vpmovsxwq,  int16_t, int64_t,  2)
_(vpmovsxwq,  int16_t, int64_t,  4)
_(vpmovsxwq,  int16_t, int64_t,  8)
_(vpmovsxdq,  int32_t, int64_t,  2)
_(vpmovsxdq,  int32_t, int64_t,  4)
_(vpmovsxdq,  int32_t, int64_t,  8)
_(vpmovsxbq_, int8_t,  uint64_t, 2)
_(vpmovsxbq_, int8_t,  uint64_t, 4)
_(vpmovsxbq_, int8_t,  uint64_t, 8)
_(vpmovsxwq_, int16_t, uint64_t, 2)
_(vpmovsxwq_, int16_t, uint64_t, 4)
_(vpmovsxwq_, int16_t, uint64_t, 8)
_(vpmovsxdq_, int32_t, uint64_t, 2)
_(vpmovsxdq_, int32_t, uint64_t, 4)
_(vpmovsxdq_, int32_t, uint64_t, 8)

// integral -> double
_(vcvtdq2pd,  int32_t,  double, 2)
_(vcvtdq2pd,  int32_t,  double, 4)
_(vcvtdq2pd,  int32_t,  double, 8)
_(vcvtudq2pd, uint32_t, double, 2)
_(vcvtudq2pd, uint32_t, double, 4)
_(vcvtudq2pd, uint32_t, double, 8)
_(vcvtqq2pd,  int64_t,  double, 2)
_(vcvtqq2pd,  int64_t,  double, 4)
_(vcvtqq2pd,  int64_t,  double, 8)
_(vcvtuqq2pd, uint64_t, double, 2)
_(vcvtuqq2pd, uint64_t, double, 4)
_(vcvtuqq2pd, uint64_t, double, 8)

// integral -> float
_(vcvtdq2ps,  int32_t,  float, 4)
_(vcvtdq2ps,  int32_t,  float, 8)
_(vcvtdq2ps,  int32_t,  float, 16)
_(vcvtudq2ps, uint32_t, float, 4)
_(vcvtudq2ps, uint32_t, float, 8)
_(vcvtudq2ps, uint32_t, float, 16)
_(vcvtqq2ps,  int64_t,  float, 4)
_(vcvtqq2ps,  int64_t,  float, 8)
_(vcvtqq2ps,  int64_t,  float, 16)
_(vcvtuqq2ps, uint64_t, float, 4)
_(vcvtuqq2ps, uint64_t, float, 8)
_(vcvtuqq2ps, uint64_t, float, 16)

// float <-> double
_(cvttpd2ps,  double, float,  2)
_(vcvttpd2ps, double, float,  4)
_(vcvttpd2ps, double, float,  8)
_(cvttps2pd,  float,  double, 2)
_(vcvttps2pd, float,  double, 4)
_(vcvttps2pd, float,  double, 8)

// float -> integral
_(cvttps2dq,   float, int32_t,  4)
_(vcvttps2dq,  float, int32_t,  8)
_(vcvttps2dq,  float, int32_t,  16)
_(cvttps2qq,   float, int64_t,  4)
_(vcvttps2qq,  float, int64_t,  8)
_(vcvttps2qq,  float, int64_t,  16)
_(cvttps2udq,  float, uint32_t, 4)
_(vcvttps2udq, float, uint32_t, 8)
_(vcvttps2udq, float, uint32_t, 16)
_(cvttps2uqq,  float, uint64_t, 4)
_(vcvttps2uqq, float, uint64_t, 8)
_(vcvttps2uqq, float, uint64_t, 16)

// double -> integral
_(cvttpd2dq,   double, int32_t,  2)
_(vcvttpd2dq,  double, int32_t,  4)
_(vcvttpd2dq,  double, int32_t,  8)
_(vcvttpd2qq,  double, int64_t,  2)
_(vcvttpd2qq,  double, int64_t,  4)
_(vcvttpd2qq,  double, int64_t,  8)
_(vcvttpd2udq, double, uint32_t, 2)
_(vcvttpd2udq, double, uint32_t, 4)
_(vcvttpd2udq, double, uint32_t, 8)
_(vcvttpd2uqq, double, uint64_t, 2)
_(vcvttpd2uqq, double, uint64_t, 4)
_(vcvttpd2uqq, double, uint64_t, 8)

// no change in type; nop
_(nop, int8_t,   int8_t,   16)
_(nop, uint8_t,  uint8_t,  16)
_(nop, int8_t,   int8_t,   32)
_(nop, uint8_t,  uint8_t,  32)
_(nop, int8_t,   int8_t,   64)
_(nop, uint8_t,  uint8_t,  64)
_(nop, int16_t,  int16_t,  8)
_(nop, uint16_t, uint16_t, 8)
_(nop, int16_t,  int16_t,  16)
_(nop, uint16_t, uint16_t, 16)
_(nop, int16_t,  int16_t,  32)
_(nop, uint16_t, uint16_t, 32)
_(nop, int32_t,  int32_t,  4)
_(nop, uint32_t, uint32_t, 4)
_(nop, int32_t,  int32_t,  8)
_(nop, uint32_t, uint32_t, 8)
_(nop, int32_t,  int32_t,  16)
_(nop, uint32_t, uint32_t, 16)
_(nop, int64_t,  int64_t,  2)
_(nop, uint64_t, uint64_t, 2)
_(nop, int64_t,  int64_t,  4)
_(nop, uint64_t, uint64_t, 4)
_(nop, int64_t,  int64_t,  8)
_(nop, uint64_t, uint64_t, 8)
_(nop, double,   double,   2)
_(nop, double,   double,   4)
_(nop, double,   double,   8)
_(nop, float,    float,    4)
_(nop, float,    float,    8)
_(nop, float,    float,    16)