This implements the requested inlining of memmove for possibly overlapping arguments by first doing all loads and then all stores. The easiest place to do this is in memory-op folding, where we already inline some memcpy cases (but fail to do the equivalent memcpy optimization - though RTL expansion later does it).
The following patch restricts the transform to at most word-mode size. Ideally we'd have a way to check for the number of real instructions needed to load an (aligned) value of size N. But maybe we don't care and are fine with doing multiple loads / stores? Anyway, the following is conservative (but maybe not enough). Bootstrap / regtest running on x86_64-unknown-linux-gnu. These transforms don't really belong to GENERIC folding (they also run at -O0 ...), similar to most builtin foldings. But this patch is not meant to change that. Any comments on the size/cost issue? Thanks, Richard. 2014-06-12 Richard Biener <rguent...@suse.de> PR middle-end/61473 * builtins.c (fold_builtin_memory_op): Inline memory moves that can be implemented with a single load followed by a single store. * gcc.dg/memmove-4.c: New testcase. Index: gcc/builtins.c =================================================================== --- gcc/builtins.c (revision 211449) +++ gcc/builtins.c (working copy) @@ -8637,11 +8637,53 @@ fold_builtin_memory_op (location_t loc, unsigned int src_align, dest_align; tree off0; - if (endp == 3) + /* Build accesses at offset zero with a ref-all character type. */ + off0 = build_int_cst (build_pointer_type_for_mode (char_type_node, + ptr_mode, true), 0); + + /* If we can perform the copy efficiently with first doing all loads + and then all stores inline it that way. Currently efficiently + means that we can load all the memory into a single integer + register and thus limited to word_mode size. Ideally we'd have + a way to query the largest mode that we can load/store with + a single instruction. 
*/ + src_align = get_pointer_alignment (src); + dest_align = get_pointer_alignment (dest); + if (tree_fits_uhwi_p (len) + && compare_tree_int (len, BITS_PER_WORD / 8) <= 0) { - src_align = get_pointer_alignment (src); - dest_align = get_pointer_alignment (dest); + unsigned ilen = tree_to_uhwi (len); + if (exact_log2 (ilen) != -1) + { + tree type = lang_hooks.types.type_for_size (ilen * 8, 1); + if (type + && TYPE_MODE (type) != BLKmode + && (GET_MODE_SIZE (TYPE_MODE (type)) * BITS_PER_UNIT + == ilen * 8) + /* If the pointers are not aligned we must be able to + emit an unaligned load. */ + && ((src_align >= GET_MODE_ALIGNMENT (TYPE_MODE (type)) + && dest_align >= GET_MODE_ALIGNMENT (TYPE_MODE (type))) + || !SLOW_UNALIGNED_ACCESS (TYPE_MODE (type), + MIN (src_align, dest_align)))) + { + tree srctype = type; + tree desttype = type; + if (src_align < GET_MODE_ALIGNMENT (TYPE_MODE (type))) + srctype = build_aligned_type (type, src_align); + if (dest_align < GET_MODE_ALIGNMENT (TYPE_MODE (type))) + desttype = build_aligned_type (type, dest_align); + destvar = fold_build2 (MEM_REF, desttype, dest, off0); + expr = build2 (MODIFY_EXPR, type, + fold_build2 (MEM_REF, desttype, dest, off0), + fold_build2 (MEM_REF, srctype, src, off0)); + goto done; + } + } + } + if (endp == 3) + { /* Both DEST and SRC must be pointer types. ??? This is what old code did. Is the testing for pointer types really mandatory? @@ -8818,10 +8860,6 @@ fold_builtin_memory_op (location_t loc, if (!ignore) dest = builtin_save_expr (dest); - /* Build accesses at offset zero with a ref-all character type. 
*/ - off0 = build_int_cst (build_pointer_type_for_mode (char_type_node, - ptr_mode, true), 0); - destvar = dest; STRIP_NOPS (destvar); if (TREE_CODE (destvar) == ADDR_EXPR @@ -8888,6 +8926,7 @@ fold_builtin_memory_op (location_t loc, expr = build2 (MODIFY_EXPR, TREE_TYPE (destvar), destvar, srcvar); } +done: if (ignore) return expr; Index: gcc/testsuite/gcc.dg/memmove-4.c =================================================================== --- gcc/testsuite/gcc.dg/memmove-4.c (revision 0) +++ gcc/testsuite/gcc.dg/memmove-4.c (working copy) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fdump-tree-optimized" } */ + +typedef int w __attribute__((mode(word))); + +void b(char *a, char *b, int i) +{ + __builtin_memmove (&a[i], &b[i], sizeof(w)); +} + +/* { dg-final { scan-tree-dump-not "memmove" "optimized" { xfail { ! non_strict_align } } } } */ +/* { dg-final { cleanup-tree-dump "optimized" } } */