https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80286
--- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> --- Yet another testcase, showing that we can generate very inefficient code: typedef short V __attribute__((vector_size (8 * sizeof (short)))); __attribute__((noinline, noclone)) V foo (V x, V y) { return x << y[0]; } int main () { V x = { 1, 2, 3, 4, 5, 6, 7, 8 }; V y = { 5, 6, 7, 8, 9, 10, 11, 12 }; V z = foo (x, y); V e = { 1 << 5, 2 << 5, 3 << 5, 4 << 5, 5 << 5, 6 << 5, 7 << 5, 8 << 5 }; if (__builtin_memcmp (&z, &e, sizeof (V))) __builtin_abort (); return 0; } vpextrw $0, %xmm1, %eax cwtl movl %eax, -12(%rsp) vmovd -12(%rsp), %xmm2 vpsllw %xmm2, %xmm0, %xmm0 For SSE4.1+, we could as well emit vpmovsxwq for the sign-extension, followed immediately by vpsllw. For, say, SSE2, at least for the unsigned int -> unsigned long zero extension, we could use pxor (to get 0) and punpck*.