Filipe points out that we cannot use a #define for the __builtin since it has to be available even when no .h is included. Optimizing the builtin would then require custom code in clang/llvm, which is probably not worth it.
================ Comment at: lib/Headers/avx2intrin.h:163 @@ -162,2 +162,3 @@ __m256i __V2 = (V2); \ - (__m256i)__builtin_ia32_pblendw256((__v16hi)__V1, (__v16hi)__V2, (M)); }) + (__m256d)__builtin_shufflevector((__v16hi)__V1, (__v16hi)__V2, \ + (((M) & 0x01) ? 16 : 0), \ ---------------- Why the change to __m256d? The Intel manual says the signature is: __m256i _mm256_blend_epi16 (__m256i v1, __m256i v2, const int mask) ================ Comment at: lib/Headers/avx2intrin.h:168 @@ +167,3 @@ + (((M) & 0x08) ? 19 : 3), \ + (((M) & 0x04) ? 20 : 4), \ + (((M) & 0x04) ? 21 : 5), \ ---------------- The masks look wrong for the high bits. Shouldn't this be: (((M) & 0x01) ? 16 : 0), \ (((M) & 0x02) ? 17 : 1), \ (((M) & 0x04) ? 18 : 2), \ (((M) & 0x08) ? 19 : 3), \ (((M) & 0x10) ? 20 : 4), \ (((M) & 0x20) ? 21 : 5), \ (((M) & 0x40) ? 22 : 6), \ (((M) & 0x80) ? 23 : 7), \ (((M) & 0x01) ? 24 : 8), \ (((M) & 0x02) ? 25 : 9), \ (((M) & 0x04) ? 26 : 10), \ (((M) & 0x08) ? 27 : 11), \ (((M) & 0x10) ? 28 : 12), \ (((M) & 0x20) ? 29 : 13), \ (((M) & 0x40) ? 30 : 14), \ (((M) & 0x80) ? 31 : 15), \ http://reviews.llvm.org/D3601 _______________________________________________ cfe-commits mailing list [email protected] http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits
