llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Simon Pilgrim (RKSimon) <details> <summary>Changes</summary> As noted on #<!-- -->156494 and #<!-- -->157403 - it's much easier to work with the byte shift intrinsics if we treat them as vXi8 types instead of vXi64 types which will require bitcasting. We already do this for the PALIGNR intrinsics which are a more advanced version of the same shuffle. --- Full diff: https://github.com/llvm/llvm-project/pull/158671.diff 8 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsX86.td (+6-6) - (modified) clang/lib/CodeGen/TargetBuiltins/X86.cpp (+18-24) - (modified) clang/lib/Headers/avx2intrin.h (+4-4) - (modified) clang/lib/Headers/avx512bwintrin.h (+2-2) - (modified) clang/lib/Headers/emmintrin.h (+4-4) - (modified) clang/lib/Headers/tmmintrin.h (+6-5) - (modified) clang/lib/Headers/xmmintrin.h (+2-2) - (modified) clang/test/CodeGen/X86/sse.c (+2) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index dd7727a39f693..aac502091b57e 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -277,8 +277,8 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">; def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">; - def pslldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">; - def psrldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">; + def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; + def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; } let Features = "sse2", @@ 
-594,12 +594,12 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; - def pslldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; + def pslldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">; def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; - def psrldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; + def psrldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">; def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; @@ -2052,8 +2052,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; def psrlw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; - def pslldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">; - def psrldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">; + def pslldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">; + 
def psrldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">; } let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index a4974e45caf10..b924407b6ddd7 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -1814,59 +1814,53 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: { unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType()); - // Builtin type is vXi64 so multiply by 8 to get bytes. - unsigned NumElts = ResultType->getNumElements() * 8; + auto *VecTy = cast<llvm::FixedVectorType>(Ops[0]->getType()); + // Builtin type is vXi8. + unsigned NumElts = VecTy->getNumElements(); + Value *Zero = llvm::Constant::getNullValue(VecTy); // If pslldq is shifting the vector more than 15 bytes, emit zero. if (ShiftVal >= 16) - return llvm::Constant::getNullValue(ResultType); + return Zero; int Indices[64]; // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that for (unsigned l = 0; l != NumElts; l += 16) { for (unsigned i = 0; i != 16; ++i) { unsigned Idx = NumElts + i - ShiftVal; - if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand. + if (Idx < NumElts) + Idx -= NumElts - 16; // end of lane, switch operand. 
Indices[l + i] = Idx + l; } } - - auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts); - Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Value *Zero = llvm::Constant::getNullValue(VecTy); - Value *SV = Builder.CreateShuffleVector( - Zero, Cast, ArrayRef(Indices, NumElts), "pslldq"); - return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast"); + return Builder.CreateShuffleVector(Zero, Ops[0], ArrayRef(Indices, NumElts), + "pslldq"); } case X86::BI__builtin_ia32_psrldqi128_byteshift: case X86::BI__builtin_ia32_psrldqi256_byteshift: case X86::BI__builtin_ia32_psrldqi512_byteshift: { unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType()); - // Builtin type is vXi64 so multiply by 8 to get bytes. - unsigned NumElts = ResultType->getNumElements() * 8; + auto *VecTy = cast<llvm::FixedVectorType>(Ops[0]->getType()); + // Builtin type is vXi8. + unsigned NumElts = VecTy->getNumElements(); + Value *Zero = llvm::Constant::getNullValue(VecTy); // If psrldq is shifting the vector more than 15 bytes, emit zero. if (ShiftVal >= 16) - return llvm::Constant::getNullValue(ResultType); + return Zero; int Indices[64]; // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that for (unsigned l = 0; l != NumElts; l += 16) { for (unsigned i = 0; i != 16; ++i) { unsigned Idx = i + ShiftVal; - if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand. + if (Idx >= 16) + Idx += NumElts - 16; // end of lane, switch operand. 
Indices[l + i] = Idx + l; } } - - auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts); - Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Value *Zero = llvm::Constant::getNullValue(VecTy); - Value *SV = Builder.CreateShuffleVector( - Cast, Zero, ArrayRef(Indices, NumElts), "psrldq"); - return Builder.CreateBitCast(SV, ResultType, "cast"); + return Builder.CreateShuffleVector(Ops[0], Zero, ArrayRef(Indices, NumElts), + "psrldq"); } case X86::BI__builtin_ia32_kshiftliqi: case X86::BI__builtin_ia32_kshiftlihi: diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index fc12a9bf15e57..4743970e7fbf0 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -2061,7 +2061,7 @@ _mm256_sign_epi32(__m256i __a, __m256i __b) /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. #define _mm256_slli_si256(a, imm) \ - ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) + ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), (int)(imm))) /// Shifts each 128-bit half of the 256-bit integer vector \a a left by /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm @@ -2081,7 +2081,7 @@ _mm256_sign_epi32(__m256i __a, __m256i __b) /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. #define _mm256_bslli_epi128(a, imm) \ - ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) + ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), (int)(imm))) /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a /// left by \a __count bits, shifting in zero bits, and returns the result. @@ -2300,7 +2300,7 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) /// An unsigned immediate value specifying the shift count (in bytes). 
/// \returns A 256-bit integer vector containing the result. #define _mm256_srli_si256(a, imm) \ - ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) + ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), (int)(imm))) /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by /// \a imm bytes, shifting in zero bytes, and returns the result. If @@ -2320,7 +2320,7 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. #define _mm256_bsrli_epi128(a, imm) \ - ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) + ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), (int)(imm))) /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a /// right by \a __count bits, shifting in zero bits, and returns the result. diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 77820a2ca041c..27dbcff8c21a3 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -1460,7 +1460,7 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) { } #define _mm512_bslli_epi128(a, imm) \ - ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) + ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v64qi)(__m512i)(a), (int)(imm))) static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_srlv_epi16(__m512i __A, __m512i __B) @@ -1591,7 +1591,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) { } #define _mm512_bsrli_epi128(a, imm) \ - ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) + ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v64qi)(__m512i)(a), (int)(imm))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) diff --git 
a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index e4fbe011239d6..12260ec6ea14c 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -2745,11 +2745,11 @@ _mm_xor_si128(__m128i __a, __m128i __b) { /// \a a. /// \returns A 128-bit integer vector containing the left-shifted value. #define _mm_slli_si128(a, imm) \ - ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) #define _mm_bslli_si128(a, imm) \ - ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) /// Left-shifts each 16-bit value in the 128-bit integer vector operand @@ -2954,11 +2954,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, /// \a a. /// \returns A 128-bit integer vector containing the right-shifted value. #define _mm_srli_si128(a, imm) \ - ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) #define _mm_bsrli_si128(a, imm) \ - ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) /// Right-shifts each of 16-bit values in the 128-bit integer vector diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index 6e7107e36ea79..d40f0c56b2c5a 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -175,11 +175,12 @@ _mm_abs_epi32(__m128i __a) { /// An immediate operand specifying how many bytes to right-shift the result. /// \returns A 64-bit integer vector containing the concatenated right-shifted /// value. 
-#define _mm_alignr_pi8(a, b, n) \ - ((__m64)__builtin_shufflevector( \ - __builtin_ia32_psrldqi128_byteshift( \ - __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \ - (n)), __extension__ (__v2di){}, 0)) +#define _mm_alignr_pi8(a, b, n) \ + ((__m64)__builtin_shufflevector( \ + (__v2di)__builtin_ia32_psrldqi128_byteshift( \ + (__v16qi)__builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \ + (n)), \ + __extension__(__v2di){}, 0)) /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h index 6b70f245e2564..4891e3ce077b5 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -2520,8 +2520,8 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) // If there's a risk of spurious trap due to a 128-bit write, back up the // pointer by 8 bytes and shift values in registers to match. __p -= 8; - __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8); - __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8); + __d128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__d128, 8); + __n128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__n128, 8); } __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p); diff --git a/clang/test/CodeGen/X86/sse.c b/clang/test/CodeGen/X86/sse.c index 017bdd7846fa3..38cc7179543d5 100644 --- a/clang/test/CodeGen/X86/sse.c +++ b/clang/test/CodeGen/X86/sse.c @@ -32,6 +32,7 @@ __m128i test_mm_slli_si128_0(__m128i a) { // CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128_16( // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> zeroinitializer // __m128i test_mm_slli_si128_16(__m128i a) { @@ -65,6 +66,7 @@ __m128i test_mm_srli_si128_0(__m128i a) { // CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128_16( 
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> zeroinitializer // __m128i test_mm_srli_si128_16(__m128i a) { `````````` </details> https://github.com/llvm/llvm-project/pull/158671 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits