https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81502
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
Note that with -mtune=intel we already get

_Z3barPv:
.LFB526:
        .cfi_startproc
        movq    %rdi, %xmm0
        movd    %xmm0, %eax
        ret

but yes, the intermediate temporary is unnecessary.  We do not optimize this
to a BIT_INSERT_EXPR because the vector element type doesn't match the
insertion quantity.  We can relax that a bit with the following:

Index: gcc/tree-ssa.c
===================================================================
--- gcc/tree-ssa.c      (revision 250386)
+++ gcc/tree-ssa.c      (working copy)
@@ -1513,8 +1513,8 @@ non_rewritable_lvalue_p (tree lhs)
       if (DECL_P (decl)
           && VECTOR_TYPE_P (TREE_TYPE (decl))
           && TYPE_MODE (TREE_TYPE (decl)) != BLKmode
-          && types_compatible_p (TREE_TYPE (lhs),
-                                 TREE_TYPE (TREE_TYPE (decl)))
+          && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
+                              TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), 0)
           && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
           && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
                               TYPE_SIZE_UNIT (TREE_TYPE (decl)))
@@ -1839,8 +1839,9 @@ execute_update_addresses_taken (void)
                     && bitmap_bit_p (suitable_for_renaming, DECL_UID (sym))
                     && VECTOR_TYPE_P (TREE_TYPE (sym))
                     && TYPE_MODE (TREE_TYPE (sym)) != BLKmode
-                    && types_compatible_p (TREE_TYPE (lhs),
-                                           TREE_TYPE (TREE_TYPE (sym)))
+                    && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
+                                        TYPE_SIZE_UNIT
+                                          (TREE_TYPE (TREE_TYPE (sym))), 0)
                     && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
                     && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
                                         TYPE_SIZE_UNIT (TREE_TYPE (sym)))
@@ -1848,6 +1849,18 @@ execute_update_addresses_taken (void)
                         % tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (lhs)))) == 0)
                   {
                     tree val = gimple_assign_rhs1 (stmt);
+                    if (! types_compatible_p (TREE_TYPE (lhs),
+                                              TREE_TYPE (TREE_TYPE (sym))))
+                      {
+                        tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (sym)));
+                        gimple *pun
+                          = gimple_build_assign (tem,
+                                                 build1 (VIEW_CONVERT_EXPR,
+                                                         TREE_TYPE (TREE_TYPE
+                                                                    (sym)), val));
+                        gsi_insert_before (&gsi, pun, GSI_SAME_STMT);
+                        val = tem;
+                      }
                     tree bitpos
                       = wide_int_to_tree (bitsizetype,
                                           mem_ref_offset (lhs) * BITS_PER_UNIT);

this gets us to

int bar(void*) (void * ptr)
{
  int res;
  __m128i word;
  long long int _2;
  unsigned int _4;

  <bb 2> [100.00%] [count: INV]:
  _2 = (long long int) ptr_6(D);
  word_3 = BIT_INSERT_EXPR <{ 0, 0 }, _2, 0 (64 bits)>;
  _4 = BIT_FIELD_REF <word_3, 32, 0>;
  res_5 = (int) _4;
  return res_5;

in .optimized which shows (already known) missed foldings for bit-field-ref
of bit-insert.  That's a complicated one btw, extracting a component from
a vector insert.

Oh, and it misses bit-insert -> CONSTRUCTOR, thus

  word_3 = { _2, 0 };

(simplify
 (bit_insert VECTOR_CST@0 @1 @2)
 {
   vec<constructor_elt, va_gc> *v;
   vec_alloc (v, TYPE_VECTOR_SUBPARTS (type));
   for (unsigned i = 0; i < VECTOR_CST_NELTS (@0); ++i)
     {
       constructor_elt elt = { NULL_TREE, VECTOR_CST_ELT (@0, i) };
       v->quick_push (elt);
     }
   (*v)[TREE_INT_CST_LOW (@2)
        / TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (type)))].value = @1;
   build_constructor (type, v);
 })

that gets us to

  <bb 2> [100.00%] [count: INV]:
  _2 = (long long int) ptr_6(D);
  word_3 = {_2, 0};
  _4 = BIT_FIELD_REF <word_3, 32, 0>;
  res_5 = (int) _4;
  return res_5;

where we still need that BIT_FIELD_REF simplification.  The IL is already in
this form when we run into FRE1 so handling it there should be possible in
principle.  Or we can fold

  word_3 = {_2, 0};
  _4 = BIT_FIELD_REF <word_3, 32, 0>;

to

  _4 = BIT_FIELD_REF <_2, 32, 0 [+adjustment]>;

thus a BIT_FIELD_REF on a CONSTRUCTOR to one on the element.