Re: calloc = malloc + memset
On 02/28/2014 11:48 PM, Marc Glisse wrote: /* Optimize + ptr = malloc (n); + memset (ptr, 0, n); + into + ptr = calloc (n); + gsi_p is known to point to a call to __builtin_memset. */ Is there anything in here to prevent us making an infinite loop if the above pattern occurs in a function called calloc? Bernd
Re: calloc = malloc + memset
On Tue, Jul 15, 2014 at 2:33 PM, Bernd Schmidt bernds_...@t-online.de wrote: On 02/28/2014 11:48 PM, Marc Glisse wrote: /* Optimize + ptr = malloc (n); + memset (ptr, 0, n); + into + ptr = calloc (n); + gsi_p is known to point to a call to __builtin_memset. */ Is there anything in here to prevent us making an infinite loop if the above pattern occurs in a function called calloc? Nothing. See how I ended up doing 2014-05-06 Richard Biener rguent...@suse.de * c-opts.c (c_common_post_options): For -freestanding, -fno-hosted and -fno-builtin disable pattern recognition if not enabled explicitly. to avoid something like this for memset/memcpy/memmove recognition. Richard. Bernd
Re: calloc = malloc + memset
On Tue, Jun 03, 2014 at 04:00:17PM +0200, Marc Glisse wrote: Ping? Ok for trunk, sorry for the delay. Jakub
Re: calloc = malloc + memset
On Mon, 23 Jun 2014, Jakub Jelinek wrote: Ok for trunk, sorry for the delay. Thanks. Richard has moved the passes a bit since then, but I still have exactly one spot where the testsuite is ok :-) I need strlen to be after dom (for calloc.C) and before vrp (for several strlenopt-*.c). I'll commit it tomorrow if there aren't any comments on the pass placement. 2014-06-24 Marc Glisse marc.gli...@inria.fr PR tree-optimization/57742 gcc/ * tree-ssa-strlen.c (get_string_length): Ignore malloc. (handle_builtin_malloc, handle_builtin_memset): New functions. (strlen_optimize_stmt): Call them. * passes.def: Move strlen after loop+dom but before vrp. gcc/testsuite/ * g++.dg/tree-ssa/calloc.C: New testcase. * gcc.dg/tree-ssa/calloc-1.c: Likewise. * gcc.dg/tree-ssa/calloc-2.c: Likewise. * gcc.dg/strlenopt-9.c: Adapt. -- Marc GlisseIndex: gcc/passes.def === --- gcc/passes.def (revision 211886) +++ gcc/passes.def (working copy) @@ -179,21 +179,20 @@ along with GCC; see the file COPYING3. DOM and erroneous path isolation should be due to degenerate PHI nodes. So rather than run the full propagators, run a specialized pass which only examines PHIs to discover const/copy propagation opportunities. */ NEXT_PASS (pass_phi_only_cprop); NEXT_PASS (pass_dse); NEXT_PASS (pass_reassoc); NEXT_PASS (pass_dce); NEXT_PASS (pass_forwprop); NEXT_PASS (pass_phiopt); - NEXT_PASS (pass_strlen); NEXT_PASS (pass_ccp); /* After CCP we rewrite no longer addressed locals into SSA form if possible. */ NEXT_PASS (pass_copy_prop); NEXT_PASS (pass_cse_sincos); NEXT_PASS (pass_optimize_bswap); NEXT_PASS (pass_split_crit_edges); NEXT_PASS (pass_pre); NEXT_PASS (pass_sink_code); NEXT_PASS (pass_asan); @@ -232,20 +231,21 @@ along with GCC; see the file COPYING3. 
NEXT_PASS (pass_loop_prefetch); NEXT_PASS (pass_iv_optimize); NEXT_PASS (pass_lim); NEXT_PASS (pass_tree_loop_done); POP_INSERT_PASSES () NEXT_PASS (pass_lower_vector_ssa); NEXT_PASS (pass_cse_reciprocals); NEXT_PASS (pass_reassoc); NEXT_PASS (pass_strength_reduction); NEXT_PASS (pass_dominator); + NEXT_PASS (pass_strlen); NEXT_PASS (pass_vrp); /* The only const/copy propagation opportunities left after DOM and VRP should be due to degenerate PHI nodes. So rather than run the full propagators, run a specialized pass which only examines PHIs to discover const/copy propagation opportunities. */ NEXT_PASS (pass_phi_only_cprop); NEXT_PASS (pass_cd_dce); NEXT_PASS (pass_tracer); NEXT_PASS (pass_dse); Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C === --- gcc/testsuite/g++.dg/tree-ssa/calloc.C (revision 0) +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C (working copy) @@ -0,0 +1,50 @@ +/* { dg-do compile } */ +/* { dg-options -O3 -fdump-tree-optimized } */ + +typedef __SIZE_TYPE__ size_t; +inline void* operator new(size_t, void* p) throw() { return p; } + +typedef void (*handler_t)(void); +extern handler_t get_handle(); + +inline void* operator new(size_t sz) +{ + void *p; + + if (sz == 0) +sz = 1; + + while ((p = __builtin_malloc (sz)) == 0) +{ + handler_t handler = get_handle (); + if (! 
handler) +throw 42; + handler (); +} + return p; +} + +struct vect { + int *start, *end; + vect(size_t n) { +start = end = 0; +if (n (size_t)-1 / sizeof(int)) + throw 33; +if (n != 0) + start = static_castint* (operator new (n * sizeof(int))); +end = start + n; +int *p = start; +for (size_t l = n; l 0; --l, ++p) + *p = 0; + } +}; + +void f (void *p, int n) +{ + new (p) vect(n); +} + +/* { dg-final { scan-tree-dump-times calloc 1 optimized } } */ +/* { dg-final { scan-tree-dump-not malloc optimized } } */ +/* { dg-final { scan-tree-dump-not memset optimized } } */ +/* { dg-final { cleanup-tree-dump optimized } } */ Index: gcc/testsuite/gcc.dg/strlenopt-9.c === --- gcc/testsuite/gcc.dg/strlenopt-9.c (revision 211886) +++ gcc/testsuite/gcc.dg/strlenopt-9.c (working copy) @@ -11,21 +11,21 @@ fn1 (int r) optimized away. */ return strchr (p, '\0'); } __attribute__((noinline, noclone)) size_t fn2 (int r) { char *p, q[10]; strcpy (q, abc); p = r ? a : q; - /* String length for p varies, therefore strlen below isn't + /* String length is constant for both alternatives, and strlen is optimized away. */ return strlen (p); } __attribute__((noinline, noclone)) size_t fn3 (char *p, int n) { int
Re: calloc = malloc + memset
On June 23, 2014 5:51:30 PM CEST, Marc Glisse marc.gli...@inria.fr wrote: On Mon, 23 Jun 2014, Jakub Jelinek wrote: Ok for trunk, sorry for the delay. Thanks. Richard has moved the passes a bit since then, but I still have exactly one spot where the testsuite is ok :-) I need strlen to be after dom (for calloc.C) and before vrp (for several strlenopt-*.c). I'll commit it tomorrow if there aren't any comments on the pass placement. But vrp does not run at -O1 - does strlenopt? 2014-06-24 Marc Glisse marc.gli...@inria.fr PR tree-optimization/57742 gcc/ * tree-ssa-strlen.c (get_string_length): Ignore malloc. (handle_builtin_malloc, handle_builtin_memset): New functions. (strlen_optimize_stmt): Call them. * passes.def: Move strlen after loop+dom but before vrp. gcc/testsuite/ * g++.dg/tree-ssa/calloc.C: New testcase. * gcc.dg/tree-ssa/calloc-1.c: Likewise. * gcc.dg/tree-ssa/calloc-2.c: Likewise. * gcc.dg/strlenopt-9.c: Adapt.
Re: calloc = malloc + memset
On Mon, 23 Jun 2014, Richard Biener wrote: On June 23, 2014 5:51:30 PM CEST, Marc Glisse marc.gli...@inria.fr wrote: On Mon, 23 Jun 2014, Jakub Jelinek wrote: Ok for trunk, sorry for the delay. Thanks. Richard has moved the passes a bit since then, but I still have exactly one spot where the testsuite is ok :-) I need strlen to be after dom (for calloc.C) and before vrp (for several strlenopt-*.c). I'll commit it tomorrow if there aren't any comments on the pass placement. But vrp does not run at -O1 - does strlenopt? { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_vrp, NULL, 1 }, So that's just a missed optimization at -Os, I guess. -- Marc Glisse
Re: calloc = malloc + memset
On Mon, Jun 23, 2014 at 6:19 PM, Marc Glisse marc.gli...@inria.fr wrote: On Mon, 23 Jun 2014, Richard Biener wrote: On June 23, 2014 5:51:30 PM CEST, Marc Glisse marc.gli...@inria.fr wrote: On Mon, 23 Jun 2014, Jakub Jelinek wrote: Ok for trunk, sorry for the delay. Thanks. Richard has moved the passes a bit since then, but I still have exactly one spot where the testsuite is ok :-) I need strlen to be after dom (for calloc.C) and before vrp (for several strlenopt-*.c). I'll commit it tomorrow if there aren't any comments on the pass placement. But vrp does not run at -O1 - does strlenopt? { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_foptimize_strlen, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_vrp, NULL, 1 }, So that's just a missed optimization at -Os, I guess. Ok, that's fine (not sure why we restrict all of strlenopt instead of just those transforms that are harmful for -Os). Richard. -- Marc Glisse
Re: calloc = malloc + memset
Marc Glisse marc.gli...@inria.fr writes: Hello, this is a stage 1 patch, and I'll ping it then, but if you have comments now... FWIW I believe the transformation will break a large variety of micro benchmarks. calloc internally knows that memory fresh from the OS is zeroed. But the memory may not be faulted in yet. memset always faults in the memory. So if you have some test like buf = malloc(...) memset(buf, ...) start = get_time(); ... do something with buf end = get_time() Now the times will be completely off because the measured time includes the page faults. -Andi -- a...@linux.intel.com -- Speaking for myself only
Re: calloc = malloc + memset
On Mon, Jun 23, 2014 at 11:17 AM, Andi Kleen a...@firstfloor.org wrote: Marc Glisse marc.gli...@inria.fr writes: Hello, this is a stage 1 patch, and I'll ping it then, but if you have comments now... FWIW i believe the transformation will break a large variety of micro benchmarks. calloc internally knows that memory fresh from the OS is zeroed. But the memory may not be faulted in yet. memset always faults in the memory. So if you have some test like buf = malloc(...) memset(buf, ...) start = get_time(); ... do something with buf end = get_time() Now the times will be completely off because the measured times includes the page faults. Easy way for these benchmarks to get around this. volatile char *vbuf = (char*)buf; for(i=0;i<bufsize;i++) *vbuf = 0; before get_time (); Now there is no way for the compiler to optimize away the inlined memset and will always be 100% correct in the future. Also micro-benchmarking is going to have issues like this too with future optimizations. Thanks, Andrew -Andi -- a...@linux.intel.com -- Speaking for myself only
Re: calloc = malloc + memset
On Mon, 23 Jun 2014, Andi Kleen wrote: FWIW i believe the transformation will break a large variety of micro benchmarks. calloc internally knows that memory fresh from the OS is zeroed. But the memory may not be faulted in yet. memset always faults in the memory. So if you have some test like buf = malloc(...) memset(buf, ...) start = get_time(); ... do something with buf end = get_time() Now the times will be completely off because the measured times includes the page faults. Good point. I guess working around compiler optimizations is part of the game for micro benchmarks, and their authors would be disappointed if the compiler didn't mess it up regularly in new and entertaining ways ;-) -- Marc Glisse
Re: calloc = malloc + memset
On Mon, Jun 23, 2014 at 09:00:02PM +0200, Marc Glisse wrote: On Mon, 23 Jun 2014, Andi Kleen wrote: FWIW i believe the transformation will break a large variety of micro benchmarks. calloc internally knows that memory fresh from the OS is zeroed. But the memory may not be faulted in yet. memset always faults in the memory. So if you have some test like buf = malloc(...) memset(buf, ...) start = get_time(); ... do something with buf end = get_time() Now the times will be completely off because the measured times includes the page faults. Good point. I guess working around compiler optimizations is part of the game for micro benchmarks, and their authors would be disappointed if the compiler didn't mess it up regularly in new and entertaining ways ;-) I would prefer to not do it. I'm not sure it has a lot of benefit. If you want to keep it please make sure there is an easy way to turn it off. -Andi -- a...@linux.intel.com -- Speaking for myself only.
Re: calloc = malloc + memset
On Mon, 23 Jun 2014, Andi Kleen wrote: I would prefer to not do it. For the sake of micro benchmarks? I'm not sure it has a lot of benefit. It has a non-zero benefit. If you want to keep it please make sure there is an easy way to turn it off. Any of these flags works: -fdisable-tree-strlen -fno-builtin-malloc -fno-builtin-memset (assuming you wrote 'memset' explicitly in your code) -fno-builtin -ffreestanding -O1 -Os In the code, you can hide that the pointer passed to memset is the one returned by malloc by storing it in a volatile variable, or any other trick to hide from the compiler that we are doing memset(malloc(n),0,n). -- Marc Glisse
Re: calloc = malloc + memset
On Mon, Jun 23, 2014 at 10:14:25PM +0200, Marc Glisse wrote: On Mon, 23 Jun 2014, Andi Kleen wrote: I would prefer to not do it. For the sake of micro benchmarks? Yes benchmarks are important. -Andi
Re: calloc = malloc + memset
On Mon, Jun 23, 2014 at 1:21 PM, Andi Kleen a...@firstfloor.org wrote: On Mon, Jun 23, 2014 at 10:14:25PM +0200, Marc Glisse wrote: On Mon, 23 Jun 2014, Andi Kleen wrote: I would prefer to not do it. For the sake of micro benchmarks? Yes benchmarks are important. But micro-benchmarks are not important. In fact this patch could improve some benchmarks as you no longer thrash your cache. So benchmarks are important but micro-benchmarks are not. Thanks, Andrew -Andi
Re: calloc = malloc + memset
Ping? On Sat, 17 May 2014, Marc Glisse wrote: Ping Jakub? https://gcc.gnu.org/ml/gcc-patches/2014-04/msg01104.html On Wed, 23 Apr 2014, Richard Biener wrote: On Fri, Apr 18, 2014 at 8:27 PM, Marc Glisse marc.gli...@inria.fr wrote: Thanks for the comments! On Fri, 18 Apr 2014, Jakub Jelinek wrote: The passes.def change makes me a little bit nervous, but if it works, perhaps. Would you prefer running the pass twice? I thought there would be less resistance to moving the pass than duplicating it. Indeed. I think placing it after loops and CSE (thus what you have done) makes sense. strlenopt itself shouldn't enable much additional optimizations. But well, pass ordering is always tricky. Didn't look at the rest of the changes, but Jakub is certainly able to approve the patch so I leave it to him. -- Marc Glisse
Re: calloc = malloc + memset
Ping Jakub? https://gcc.gnu.org/ml/gcc-patches/2014-04/msg01104.html On Wed, 23 Apr 2014, Richard Biener wrote: On Fri, Apr 18, 2014 at 8:27 PM, Marc Glisse marc.gli...@inria.fr wrote: Thanks for the comments! On Fri, 18 Apr 2014, Jakub Jelinek wrote: The passes.def change makes me a little bit nervous, but if it works, perhaps. Would you prefer running the pass twice? I thought there would be less resistance to moving the pass than duplicating it. Indeed. I think placing it after loops and CSE (thus what you have done) makes sense. strlenopt itself shouldn't enable much additional optimizations. But well, pass ordering is always tricky. Didn't look at the rest of the changes, but Jakub is certainly able to approve the patch so I leave it to him. Thanks, Richard. By the way, I think even passes we run only once should have the required functions implemented so they can be run several times (at least most of them), in case users want to do that in plugins. I was surprised when I tried adding a second strlen pass and the compiler refused. --- gcc/testsuite/g++.dg/tree-ssa/calloc.C (revision 0) +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C (working copy) @@ -0,0 +1,35 @@ +/* { dg-do compile { target c++11 } } */ +/* { dg-options -O3 -fdump-tree-optimized } */ + +#include new +#include vector +#include cstdlib + +void g(void*); +inline void* operator new(std::size_t sz) +{ + void *p; + + if (sz == 0) +sz = 1; + + // Slightly modified from the libsupc++ version, that one has 2 calls + // to malloc which makes it too hard to optimize. + while ((p = std::malloc (sz)) == 0) +{ + std::new_handler handler = std::get_new_handler (); + if (! 
handler) +throw std::bad_alloc(); + handler (); +} + return p; +} + +void f(void*p,int n){ + new(p)std::vectorint(n); +} + +/* { dg-final { scan-tree-dump-times calloc 1 optimized } } */ +/* { dg-final { scan-tree-dump-not malloc optimized } } */ +/* { dg-final { scan-tree-dump-not memset optimized } } */ +/* { dg-final { cleanup-tree-dump optimized } } */ This looks to me way too much fragile, any time the libstdc++ or glibc headers change a little bit, you might need to adjust the dg-final directives. Much better would be if you just provided the prototypes yourself and subset of the std::vector you really need for the testcase. You can throw some class or int, it doesn't have to be std::bad_alloc, etc. I don't understand what seems so fragile to you. There is a single function in the .optimized dump, which just calls calloc in a loop. It doesn't seem that likely that a change in glibc/libstdc++ would make an extra memset pop up. A change in libstdc++ could easily prevent the optimization completely (I'd like to hope we can avoid that, half of the purpose of the testcase was making sure libstdc++ didn't change in a bad way), but I don't really see how it could keep it in a way that requires tweaking dg-final. While trying to write a standalone version, I hit again many missed optimizations, getting such nice things in the .optimized dump as: _12 = p_13 + sz_7; if (_12 != p_13) or: _12 = p_13 + sz_7; _30 = (unsigned long) _12; _9 = p_13 + 4; _10 = (unsigned long) _9; _11 = _30 - _10; _22 = _11 /[ex] 4; _21 = _22; _40 = _21 + 1; _34 = _40 * 4; It is embarrassing... I hope the combiner GSoC will work well and we can just add a dozen patterns to handle this before 4.10. --- gcc/testsuite/gcc.dg/strlenopt-9.c (revision 208772) +++ gcc/testsuite/gcc.dg/strlenopt-9.c (working copy) @@ -11,21 +11,21 @@ fn1 (int r) optimized away. */ return strchr (p, '\0'); } __attribute__((noinline, noclone)) size_t fn2 (int r) { char *p, q[10]; strcpy (q, abc); p = r ? 
a : q; - /* String length for p varies, therefore strlen below isn't + /* String length is constant for both alternatives, and strlen is optimized away. */ return strlen (p); Is this because of jump threading? It is PRE that turns: if (r_4(D) == 0) goto bb 5; else goto bb 3; bb 5: goto bb 4; bb 3: bb 4: # p_1 = PHI q(5), a(3) _5 = __builtin_strlen (p_1); into: if (r_4(D) == 0) goto bb 5; else goto bb 3; bb 5: _7 = __builtin_strlen (q); pretmp_8 = _7; goto bb 4; bb 3: bb 4: # p_1 = PHI q(5), a(3) # prephitmp_9 = PHI pretmp_8(5), 1(3) _5 = prephitmp_9; It says: Found partial redundancy for expression {call_expr__builtin_strlen,p_1}@.MEM_3 (0005) --- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c(revision 0) +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c(working copy) @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options -O2 -fdump-tree-optimized } */ + +#include stdlib.h +#include string.h Even this I find unsafe. The strlenopt*.c tests use it's custom strlenopt.h header for a reason, you might just add a calloc prototype in there and use that header. Might as well use __builtin_* then. +/* Handle a call
Re: calloc = malloc + memset
On Fri, Apr 18, 2014 at 8:27 PM, Marc Glisse marc.gli...@inria.fr wrote: Thanks for the comments! On Fri, 18 Apr 2014, Jakub Jelinek wrote: The passes.def change makes me a little bit nervous, but if it works, perhaps. Would you prefer running the pass twice? I thought there would be less resistance to moving the pass than duplicating it. Indeed. I think placing it after loops and CSE (thus what you have done) makes sense. strlenopt itself shouldn't enable much additional optimizations. But well, pass ordering is always tricky. Didn't look at the rest of the changes, but Jakub is certainly able to approve the patch so I leave it to him. Thanks, Richard. By the way, I think even passes we run only once should have the required functions implemented so they can be run several times (at least most of them), in case users want to do that in plugins. I was surprised when I tried adding a second strlen pass and the compiler refused. --- gcc/testsuite/g++.dg/tree-ssa/calloc.C (revision 0) +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C (working copy) @@ -0,0 +1,35 @@ +/* { dg-do compile { target c++11 } } */ +/* { dg-options -O3 -fdump-tree-optimized } */ + +#include new +#include vector +#include cstdlib + +void g(void*); +inline void* operator new(std::size_t sz) +{ + void *p; + + if (sz == 0) +sz = 1; + + // Slightly modified from the libsupc++ version, that one has 2 calls + // to malloc which makes it too hard to optimize. + while ((p = std::malloc (sz)) == 0) +{ + std::new_handler handler = std::get_new_handler (); + if (! 
handler) +throw std::bad_alloc(); + handler (); +} + return p; +} + +void f(void*p,int n){ + new(p)std::vectorint(n); +} + +/* { dg-final { scan-tree-dump-times calloc 1 optimized } } */ +/* { dg-final { scan-tree-dump-not malloc optimized } } */ +/* { dg-final { scan-tree-dump-not memset optimized } } */ +/* { dg-final { cleanup-tree-dump optimized } } */ This looks to me way too much fragile, any time the libstdc++ or glibc headers change a little bit, you might need to adjust the dg-final directives. Much better would be if you just provided the prototypes yourself and subset of the std::vector you really need for the testcase. You can throw some class or int, it doesn't have to be std::bad_alloc, etc. I don't understand what seems so fragile to you. There is a single function in the .optimized dump, which just calls calloc in a loop. It doesn't seem that likely that a change in glibc/libstdc++ would make an extra memset pop up. A change in libstdc++ could easily prevent the optimization completely (I'd like to hope we can avoid that, half of the purpose of the testcase was making sure libstdc++ didn't change in a bad way), but I don't really see how it could keep it in a way that requires tweaking dg-final. While trying to write a standalone version, I hit again many missed optimizations, getting such nice things in the .optimized dump as: _12 = p_13 + sz_7; if (_12 != p_13) or: _12 = p_13 + sz_7; _30 = (unsigned long) _12; _9 = p_13 + 4; _10 = (unsigned long) _9; _11 = _30 - _10; _22 = _11 /[ex] 4; _21 = _22; _40 = _21 + 1; _34 = _40 * 4; It is embarrassing... I hope the combiner GSoC will work well and we can just add a dozen patterns to handle this before 4.10. --- gcc/testsuite/gcc.dg/strlenopt-9.c (revision 208772) +++ gcc/testsuite/gcc.dg/strlenopt-9.c (working copy) @@ -11,21 +11,21 @@ fn1 (int r) optimized away. */ return strchr (p, '\0'); } __attribute__((noinline, noclone)) size_t fn2 (int r) { char *p, q[10]; strcpy (q, abc); p = r ? 
a : q; - /* String length for p varies, therefore strlen below isn't + /* String length is constant for both alternatives, and strlen is optimized away. */ return strlen (p); Is this because of jump threading? It is PRE that turns: if (r_4(D) == 0) goto bb 5; else goto bb 3; bb 5: goto bb 4; bb 3: bb 4: # p_1 = PHI q(5), a(3) _5 = __builtin_strlen (p_1); into: if (r_4(D) == 0) goto bb 5; else goto bb 3; bb 5: _7 = __builtin_strlen (q); pretmp_8 = _7; goto bb 4; bb 3: bb 4: # p_1 = PHI q(5), a(3) # prephitmp_9 = PHI pretmp_8(5), 1(3) _5 = prephitmp_9; It says: Found partial redundancy for expression {call_expr__builtin_strlen,p_1}@.MEM_3 (0005) --- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c(revision 0) +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c(working copy) @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options -O2 -fdump-tree-optimized } */ + +#include stdlib.h +#include string.h Even this I find unsafe. The strlenopt*.c tests use it's custom strlenopt.h header for a reason, you might just add a calloc prototype in there and use that header. Might as well use __builtin_* then. +/* Handle a
Re: calloc = malloc + memset
Thanks for the comments! On Fri, 18 Apr 2014, Jakub Jelinek wrote: The passes.def change makes me a little bit nervous, but if it works, perhaps. Would you prefer running the pass twice? I thought there would be less resistance to moving the pass than duplicating it. By the way, I think even passes we run only once should have the required functions implemented so they can be run several times (at least most of them), in case users want to do that in plugins. I was surprised when I tried adding a second strlen pass and the compiler refused. --- gcc/testsuite/g++.dg/tree-ssa/calloc.C (revision 0) +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C (working copy) @@ -0,0 +1,35 @@ +/* { dg-do compile { target c++11 } } */ +/* { dg-options -O3 -fdump-tree-optimized } */ + +#include new +#include vector +#include cstdlib + +void g(void*); +inline void* operator new(std::size_t sz) +{ + void *p; + + if (sz == 0) +sz = 1; + + // Slightly modified from the libsupc++ version, that one has 2 calls + // to malloc which makes it too hard to optimize. + while ((p = std::malloc (sz)) == 0) +{ + std::new_handler handler = std::get_new_handler (); + if (! handler) +throw std::bad_alloc(); + handler (); +} + return p; +} + +void f(void*p,int n){ + new(p)std::vectorint(n); +} + +/* { dg-final { scan-tree-dump-times calloc 1 optimized } } */ +/* { dg-final { scan-tree-dump-not malloc optimized } } */ +/* { dg-final { scan-tree-dump-not memset optimized } } */ +/* { dg-final { cleanup-tree-dump optimized } } */ This looks to me way too much fragile, any time the libstdc++ or glibc headers change a little bit, you might need to adjust the dg-final directives. Much better would be if you just provided the prototypes yourself and subset of the std::vector you really need for the testcase. You can throw some class or int, it doesn't have to be std::bad_alloc, etc. I don't understand what seems so fragile to you. There is a single function in the .optimized dump, which just calls calloc in a loop. 
It doesn't seem that likely that a change in glibc/libstdc++ would make an extra memset pop up. A change in libstdc++ could easily prevent the optimization completely (I'd like to hope we can avoid that, half of the purpose of the testcase was making sure libstdc++ didn't change in a bad way), but I don't really see how it could keep it in a way that requires tweaking dg-final. While trying to write a standalone version, I hit again many missed optimizations, getting such nice things in the .optimized dump as: _12 = p_13 + sz_7; if (_12 != p_13) or: _12 = p_13 + sz_7; _30 = (unsigned long) _12; _9 = p_13 + 4; _10 = (unsigned long) _9; _11 = _30 - _10; _22 = _11 /[ex] 4; _21 = _22; _40 = _21 + 1; _34 = _40 * 4; It is embarrassing... I hope the combiner GSoC will work well and we can just add a dozen patterns to handle this before 4.10. --- gcc/testsuite/gcc.dg/strlenopt-9.c (revision 208772) +++ gcc/testsuite/gcc.dg/strlenopt-9.c (working copy) @@ -11,21 +11,21 @@ fn1 (int r) optimized away. */ return strchr (p, '\0'); } __attribute__((noinline, noclone)) size_t fn2 (int r) { char *p, q[10]; strcpy (q, abc); p = r ? a : q; - /* String length for p varies, therefore strlen below isn't + /* String length is constant for both alternatives, and strlen is optimized away. */ return strlen (p); Is this because of jump threading? 
It is PRE that turns: if (r_4(D) == 0) goto bb 5; else goto bb 3; bb 5: goto bb 4; bb 3: bb 4: # p_1 = PHI q(5), a(3) _5 = __builtin_strlen (p_1); into: if (r_4(D) == 0) goto bb 5; else goto bb 3; bb 5: _7 = __builtin_strlen (q); pretmp_8 = _7; goto bb 4; bb 3: bb 4: # p_1 = PHI q(5), a(3) # prephitmp_9 = PHI pretmp_8(5), 1(3) _5 = prephitmp_9; It says: Found partial redundancy for expression {call_expr__builtin_strlen,p_1}@.MEM_3 (0005) --- gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c(revision 0) +++ gcc/testsuite/gcc.dg/tree-ssa/calloc-1.c(working copy) @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options -O2 -fdump-tree-optimized } */ + +#include stdlib.h +#include string.h Even this I find unsafe. The strlenopt*.c tests use it's custom strlenopt.h header for a reason, you might just add a calloc prototype in there and use that header. Might as well use __builtin_* then. +/* Handle a call to malloc or calloc. */ + +static void +handle_builtin_malloc (enum built_in_function bcode, gimple_stmt_iterator *gsi) +{ + gimple stmt = gsi_stmt (*gsi); + tree lhs = gimple_call_lhs (stmt); + gcc_assert (get_stridx (lhs) == 0); + int idx = new_stridx (lhs); + tree length = NULL_TREE; + if (bcode == BUILT_IN_CALLOC) +length = build_int_cst (size_type_node, 0); Is this safe? I mean, if you call int a = 0; ptr = calloc (a, n); or ptr = calloc (n, a); or ptr = calloc (0, 0); etc., then there
Re: calloc = malloc + memset
Let me ping this. There's no hurry, but it may have got lost with 4.9 approaching. http://gcc.gnu.org/ml/gcc-patches/2014-03/msg01205.html On Sun, 23 Mar 2014, Marc Glisse wrote: On Mon, 3 Mar 2014, Richard Biener wrote: That's a bit much of ad-hoc pattern-matching ... wouldn't be p = malloc (n); memset (p, 0, n); transform better suited to the strlen opt pass? After all that tracks what 'string' is associated with a SSA name pointer through arbitrary statements using a lattice. Like this? I had to move the strlen pass after the loop passes (and after dom or everything was too dirty) but long enough before the end (some optimizations are necessary after strlen). As a bonus, one more strlen is optimized in the current testcases :-) Running the pass twice would be another option I guess (it would require implementing the clone method), but without a testcase showing it is needed... Passes bootstrap+testsuite on x86_64-linux-gnu. 2014-03-23 Marc Glisse marc.gli...@inria.fr PR tree-optimization/57742 gcc/ * tree-ssa-strlen.c (get_string_length): Ignore malloc. (handle_builtin_malloc, handle_builtin_memset): New functions. (strlen_optimize_stmt): Call them. * passes.def: Move strlen after loop+dom. gcc/testsuite/ * g++.dg/tree-ssa/calloc.C: New testcase. * gcc.dg/tree-ssa/calloc-1.c: Likewise. * gcc.dg/tree-ssa/calloc-2.c: Likewise. * gcc.dg/strlenopt-9.c: Adapt. -- Marc Glisse
Re: calloc = malloc + memset
On Mon, 3 Mar 2014, Richard Biener wrote: That's a bit much of ad-hoc pattern-matching ... wouldn't be p = malloc (n); memset (p, 0, n); transform better suited to the strlen opt pass? After all that tracks what 'string' is associated with a SSA name pointer through arbitrary satements using a lattice. Like this? I had to move the strlen pass after the loop passes (and after dom or everything was too dirty) but long enough before the end (some optimizations are necessary after strlen). As a bonus, one more strlen is optimized in the current testcases :-) Running the pass twice would be another option I guess (it would require implementing the clone method), but without a testcase showing it is needed... Passes bootstrap+testsuite on x86_64-linux-gnu. 2014-03-23 Marc Glisse marc.gli...@inria.fr PR tree-optimization/57742 gcc/ * tree-ssa-strlen.c (get_string_length): Ignore malloc. (handle_builtin_malloc, handle_builtin_memset): New functions. (strlen_optimize_stmt): Call them. * passes.def: Move strlen after loop+dom. gcc/testsuite/ * g++.dg/tree-ssa/calloc.C: New testcase. * gcc.dg/tree-ssa/calloc-1.c: Likewise. * gcc.dg/tree-ssa/calloc-2.c: Likewise. * gcc.dg/strlenopt-9.c: Adapt. -- Marc GlisseIndex: gcc/passes.def === --- gcc/passes.def (revision 208772) +++ gcc/passes.def (working copy) @@ -176,21 +176,20 @@ along with GCC; see the file COPYING3. DOM and erroneous path isolation should be due to degenerate PHI nodes. So rather than run the full propagators, run a specialized pass which only examines PHIs to discover const/copy propagation opportunities. */ NEXT_PASS (pass_phi_only_cprop); NEXT_PASS (pass_dse); NEXT_PASS (pass_reassoc); NEXT_PASS (pass_dce); NEXT_PASS (pass_forwprop); NEXT_PASS (pass_phiopt); - NEXT_PASS (pass_strlen); NEXT_PASS (pass_ccp); /* After CCP we rewrite no longer addressed locals into SSA form if possible. 
*/ NEXT_PASS (pass_copy_prop); NEXT_PASS (pass_cse_sincos); NEXT_PASS (pass_optimize_bswap); NEXT_PASS (pass_split_crit_edges); NEXT_PASS (pass_pre); NEXT_PASS (pass_sink_code); NEXT_PASS (pass_asan); @@ -235,20 +234,21 @@ along with GCC; see the file COPYING3. NEXT_PASS (pass_cse_reciprocals); NEXT_PASS (pass_reassoc); NEXT_PASS (pass_strength_reduction); NEXT_PASS (pass_dominator); /* The only const/copy propagation opportunities left after DOM should be due to degenerate PHI nodes. So rather than run the full propagators, run a specialized pass which only examines PHIs to discover const/copy propagation opportunities. */ NEXT_PASS (pass_phi_only_cprop); + NEXT_PASS (pass_strlen); NEXT_PASS (pass_vrp); NEXT_PASS (pass_cd_dce); NEXT_PASS (pass_tracer); NEXT_PASS (pass_dse); NEXT_PASS (pass_forwprop); NEXT_PASS (pass_phiopt); NEXT_PASS (pass_fold_builtins); NEXT_PASS (pass_optimize_widening_mul); NEXT_PASS (pass_tail_calls); NEXT_PASS (pass_rename_ssa_copies); Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C === --- gcc/testsuite/g++.dg/tree-ssa/calloc.C (revision 0) +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C (working copy) @@ -0,0 +1,35 @@ +/* { dg-do compile { target c++11 } } */ +/* { dg-options -O3 -fdump-tree-optimized } */ + +#include new +#include vector +#include cstdlib + +void g(void*); +inline void* operator new(std::size_t sz) +{ + void *p; + + if (sz == 0) +sz = 1; + + // Slightly modified from the libsupc++ version, that one has 2 calls + // to malloc which makes it too hard to optimize. + while ((p = std::malloc (sz)) == 0) +{ + std::new_handler handler = std::get_new_handler (); + if (! 
handler) +throw std::bad_alloc(); + handler (); +} + return p; +} + +void f(void*p,int n){ + new(p)std::vectorint(n); +} + +/* { dg-final { scan-tree-dump-times calloc 1 optimized } } */ +/* { dg-final { scan-tree-dump-not malloc optimized } } */ +/* { dg-final { scan-tree-dump-not memset optimized } } */ +/* { dg-final { cleanup-tree-dump optimized } } */ Property changes on: gcc/testsuite/g++.dg/tree-ssa/calloc.C ___ Added: svn:keywords ## -0,0 +1 ## +Author Date Id Revision URL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: gcc/testsuite/gcc.dg/strlenopt-9.c === --- gcc/testsuite/gcc.dg/strlenopt-9.c (revision 208772) +++ gcc/testsuite/gcc.dg/strlenopt-9.c (working copy) @@ -11,21 +11,21 @@ fn1 (int r)
Re: calloc = malloc + memset
On Fri, Feb 28, 2014 at 11:48 PM, Marc Glisse marc.gli...@inria.fr wrote: Hello, this is a stage 1 patch, and I'll ping it then, but if you have comments now... Passes bootstrap+testsuite on x86_64-linux-gnu. 2014-02-28 Marc Glisse marc.gli...@inria.fr PR tree-optimization/57742 gcc/ * tree-ssa-forwprop.c (simplify_malloc_memset): New function. (simplify_builtin_call): Call it. gcc/testsuite/ * g++.dg/tree-ssa/calloc.C: New testcase. * gcc.dg/tree-ssa/calloc.c: Likewise. -- Marc Glisse Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C === --- gcc/testsuite/g++.dg/tree-ssa/calloc.C (revision 0) +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C (working copy) @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options -std=gnu++11 -O3 -fdump-tree-optimized } */ + +#include new +#include vector +#include cstdlib + +void g(void*); +inline void* operator new(std::size_t sz) _GLIBCXX_THROW (std::bad_alloc) +{ + void *p; + + if (sz == 0) +sz = 1; + + // Slightly modified from the libsupc++ version, that one has 2 calls + // to malloc which makes it too hard to optimize. + while ((p = std::malloc (sz)) == 0) +{ + std::new_handler handler = std::get_new_handler (); + if (! 
handler) +_GLIBCXX_THROW_OR_ABORT(std::bad_alloc()); + handler (); +} + return p; +} + +void f(void*p,int n){ + new(p)std::vectorint(n); +} + +/* { dg-final { scan-tree-dump-times calloc 1 optimized } } */ +/* { dg-final { scan-tree-dump-not malloc optimized } } */ +/* { dg-final { scan-tree-dump-not memset optimized } } */ +/* { dg-final { cleanup-tree-dump optimized } } */ Property changes on: gcc/testsuite/g++.dg/tree-ssa/calloc.C ___ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +Author Date Id Revision URL \ No newline at end of property Index: gcc/testsuite/gcc.dg/tree-ssa/calloc.c === --- gcc/testsuite/gcc.dg/tree-ssa/calloc.c (revision 0) +++ gcc/testsuite/gcc.dg/tree-ssa/calloc.c (working copy) @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options -O2 -fdump-tree-optimized } */ + +#include stdlib.h +#include string.h + +extern int a; +extern int* b; +int n; +void* f(long*q){ + int*p=malloc(n); + ++*q; + if(p){ +++*q; +a=2; +memset(p,0,n); +*b=3; + } + return p; +} +void* g(void){ + float*p=calloc(8,4); + return memset(p,0,32); +} + +/* { dg-final { scan-tree-dump-times calloc 2 optimized } } */ +/* { dg-final { scan-tree-dump-not malloc optimized } } */ +/* { dg-final { scan-tree-dump-not memset optimized } } */ +/* { dg-final { cleanup-tree-dump optimized } } */ Property changes on: gcc/testsuite/gcc.dg/tree-ssa/calloc.c ___ Added: svn:keywords ## -0,0 +1 ## +Author Date Id Revision URL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: gcc/tree-ssa-forwprop.c === --- gcc/tree-ssa-forwprop.c (revision 208224) +++ gcc/tree-ssa-forwprop.c (working copy) @@ -1487,20 +1487,149 @@ constant_pointer_difference (tree p1, tr } for (i = 0; i cnt[0]; i++) for (j = 0; j cnt[1]; j++) if (exps[0][i] == exps[1][j]) return size_binop (MINUS_EXPR, offs[0][i], offs[1][j]); return NULL_TREE; } +/* Optimize + ptr = malloc (n); + memset (ptr, 0, 
n); + into + ptr = calloc (n); + gsi_p is known to point to a call to __builtin_memset. */ +static bool +simplify_malloc_memset (gimple_stmt_iterator *gsi_p) +{ + /* First make sure we have: + ptr = malloc (n); + memset (ptr, 0, n); */ + gimple stmt2 = gsi_stmt (*gsi_p); + if (!integer_zerop (gimple_call_arg (stmt2, 1))) +return false; + tree ptr1, ptr2 = gimple_call_arg (stmt2, 0); + tree size = gimple_call_arg (stmt2, 2); + if (TREE_CODE (ptr2) != SSA_NAME) +return false; + gimple stmt1 = SSA_NAME_DEF_STMT (ptr2); + tree callee1; + /* Handle the case where STMT1 is a unary PHI, which happends + for instance with: + while (!(p = malloc (n))) { ... } + memset (p, 0, n); */ + if (!stmt1) +return false; + if (gimple_code (stmt1) == GIMPLE_PHI + gimple_phi_num_args (stmt1) == 1) +{ + ptr1 = gimple_phi_arg_def (stmt1, 0); + if (TREE_CODE (ptr1) != SSA_NAME) + return false; + stmt1 = SSA_NAME_DEF_STMT (ptr1); +} + else +ptr1 = ptr2; + if (!stmt1 + || !is_gimple_call (stmt1) + || !(callee1 = gimple_call_fndecl (stmt1))) +return false; That's a bit
Re: calloc = malloc + memset
On Mon, 3 Mar 2014, Richard Biener wrote: That's a bit much of ad-hoc pattern-matching ... wouldn't the p = malloc (n); memset (p, 0, n); transform be better suited to the strlen opt pass? After all, that tracks what 'string' is associated with an SSA name pointer through arbitrary statements using a lattice. Too early, it needs to run later than ldist, or there won't be any memset to match in the std::vector case. Would you consider moving or duplicating either strlen or ldist so they are run in the order I need? The same probably applies to calloc(); memset (, 0,); Oh, you mean the length doesn't have to match for calloc? That's true, I completely missed that. though here you could even match points-to info (after all even only clearing a portion of the calloc()ed memory is dead code). If points-to conservatively computes that the memset pointer only points to null or the memory tag the calloc return value points to then you can discard it without further checking ... I'll look into it (and DSE). Note that the calloc case is just an afterthought, what I really care about is replacing malloc. + /* Finally, make sure the memory is not used before stmt2. */ + ao_ref ref; + ao_ref_init_from_ptr_and_size (&ref, ptr1, size); + tree vdef = gimple_vuse (stmt2); + if (vdef == NULL) +return false; + while (true) +{ + gimple cur = SSA_NAME_DEF_STMT (vdef); + if (cur == stmt1) break; + if (stmt_may_clobber_ref_p_1 (cur, &ref)) + return false; + vdef = gimple_vuse (cur); +} We have walk_aliased_vdefs() for this. As explained in the PR, walk_aliased_vdefs misses the call to malloc (it doesn't clobber the memory pointed to by p). You then suggested: Exact pattern matching of the CFG involved might be the easiest, plus manually implementing walk_aliased_vdefs by simply walking the use-def chain of the virtual operands from the memset operation to the malloc and checking stmt_may_clobber_ref_p_1 on the ao_ref_init_from_ptr_and_size ref. 
That said, please try to integrate this kind of transform with the strlen opt pass (even if it requires making its lattice more generic). Assuming the passes have a chance of being reordered, I'll try to understand how strlen works. Thanks for the comments, -- Marc Glisse
Re: calloc = malloc + memset
Hi On 28/feb/2014, at 23:48, Marc Glisse marc.gli...@inria.fr wrote: Hello, this is a stage 1 patch, and I'll ping it then, but if you have comments now... Passes bootstrap+testsuite on x86_64-linux-gnu. 2014-02-28 Marc Glisse marc.gli...@inria.fr PR tree-optimization/57742 gcc/ * tree-ssa-forwprop.c (simplify_malloc_memset): New function. (simplify_builtin_call): Call it. gcc/testsuite/ * g++.dg/tree-ssa/calloc.C: New testcase. * gcc.dg/tree-ssa/calloc.c: Likewise. -- Marc Glisse Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C === --- gcc/testsuite/g++.dg/tree-ssa/calloc.C(revision 0) +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C(working copy) @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-std=gnu++11 -O3 -fdump-tree-optimized" } */ + +#include <new> +#include <vector> +#include <cstdlib> + +void g(void*); +inline void* operator new(std::size_t sz) _GLIBCXX_THROW (std::bad_alloc) Unless *really* necessary I would recommend not including the large <vector> (that also couples quite seriously the front-end testsuite to the library testsuite, we already discussed those topics in the past). Using the internal macros also seems unnecessary. Paolo
Re: calloc = malloc + memset
On Sat, 1 Mar 2014, Paolo Carlini wrote: Hi On 28/feb/2014, at 23:48, Marc Glisse marc.gli...@inria.fr wrote: Hello, this is a stage 1 patch, and I'll ping it then, but if you have comments now... Passes bootstrap+testsuite on x86_64-linux-gnu. 2014-02-28 Marc Glisse marc.gli...@inria.fr PR tree-optimization/57742 gcc/ * tree-ssa-forwprop.c (simplify_malloc_memset): New function. (simplify_builtin_call): Call it. gcc/testsuite/ * g++.dg/tree-ssa/calloc.C: New testcase. * gcc.dg/tree-ssa/calloc.c: Likewise. -- Marc Glisse Index: gcc/testsuite/g++.dg/tree-ssa/calloc.C === --- gcc/testsuite/g++.dg/tree-ssa/calloc.C(revision 0) +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C(working copy) @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options -std=gnu++11 -O3 -fdump-tree-optimized } */ + +#include new +#include vector +#include cstdlib + +void g(void*); +inline void* operator new(std::size_t sz) _GLIBCXX_THROW (std::bad_alloc) Unless *really* necessary I would recommend not including the large vector (that also couples quite seriously the front-end testsuite to the library testsuite, we already discussed those topics in the past). Using the internal macros seems also unnecessary. I think it might be the first time I include large headers in a compiler testcase (note that there are already 16 other testcases including vector in g++.dg). In this case, it seems to be what I want to test though. I already have some elementary tests in gcc.dg. This testcase is the original motivation for working on this. It requires a combination of quite a few optimizations (inlining, recognizing that a loop is a memset, aliasing, this optimization (the complicated version with a PHI node)), and I want to test that we won't for instance shuffle the passes in a way that breaks it. 
Also, if the library changes <vector> enough that this doesn't optimize anymore, I want to know about it: either the library change was wrong or the middle-end needs to improve some optimization before the next release. I wanted to keep the implementation of operator new as close to the one in libsupc++ as possible (to mimic LTO), so I copy-pasted (and slightly edited, I may propose a patch to libsupc++ later). I agree that I should remove the exception specification (since I am compiling in C++11 to have access to get_new_handler) and replace _GLIBCXX_THROW_OR_ABORT with just throw, and I just did it locally, thanks. -- Marc Glisse
calloc = malloc + memset
Hello, this is a stage 1 patch, and I'll ping it then, but if you have comments now... Passes bootstrap+testsuite on x86_64-linux-gnu. 2014-02-28 Marc Glisse marc.gli...@inria.fr PR tree-optimization/57742 gcc/ * tree-ssa-forwprop.c (simplify_malloc_memset): New function. (simplify_builtin_call): Call it. gcc/testsuite/ * g++.dg/tree-ssa/calloc.C: New testcase. * gcc.dg/tree-ssa/calloc.c: Likewise. -- Marc GlisseIndex: gcc/testsuite/g++.dg/tree-ssa/calloc.C === --- gcc/testsuite/g++.dg/tree-ssa/calloc.C (revision 0) +++ gcc/testsuite/g++.dg/tree-ssa/calloc.C (working copy) @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options -std=gnu++11 -O3 -fdump-tree-optimized } */ + +#include new +#include vector +#include cstdlib + +void g(void*); +inline void* operator new(std::size_t sz) _GLIBCXX_THROW (std::bad_alloc) +{ + void *p; + + if (sz == 0) +sz = 1; + + // Slightly modified from the libsupc++ version, that one has 2 calls + // to malloc which makes it too hard to optimize. + while ((p = std::malloc (sz)) == 0) +{ + std::new_handler handler = std::get_new_handler (); + if (! 
handler) +_GLIBCXX_THROW_OR_ABORT(std::bad_alloc()); + handler (); +} + return p; +} + +void f(void*p,int n){ + new(p)std::vectorint(n); +} + +/* { dg-final { scan-tree-dump-times calloc 1 optimized } } */ +/* { dg-final { scan-tree-dump-not malloc optimized } } */ +/* { dg-final { scan-tree-dump-not memset optimized } } */ +/* { dg-final { cleanup-tree-dump optimized } } */ Property changes on: gcc/testsuite/g++.dg/tree-ssa/calloc.C ___ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +Author Date Id Revision URL \ No newline at end of property Index: gcc/testsuite/gcc.dg/tree-ssa/calloc.c === --- gcc/testsuite/gcc.dg/tree-ssa/calloc.c (revision 0) +++ gcc/testsuite/gcc.dg/tree-ssa/calloc.c (working copy) @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options -O2 -fdump-tree-optimized } */ + +#include stdlib.h +#include string.h + +extern int a; +extern int* b; +int n; +void* f(long*q){ + int*p=malloc(n); + ++*q; + if(p){ +++*q; +a=2; +memset(p,0,n); +*b=3; + } + return p; +} +void* g(void){ + float*p=calloc(8,4); + return memset(p,0,32); +} + +/* { dg-final { scan-tree-dump-times calloc 2 optimized } } */ +/* { dg-final { scan-tree-dump-not malloc optimized } } */ +/* { dg-final { scan-tree-dump-not memset optimized } } */ +/* { dg-final { cleanup-tree-dump optimized } } */ Property changes on: gcc/testsuite/gcc.dg/tree-ssa/calloc.c ___ Added: svn:keywords ## -0,0 +1 ## +Author Date Id Revision URL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: gcc/tree-ssa-forwprop.c === --- gcc/tree-ssa-forwprop.c (revision 208224) +++ gcc/tree-ssa-forwprop.c (working copy) @@ -1487,20 +1487,149 @@ constant_pointer_difference (tree p1, tr } for (i = 0; i cnt[0]; i++) for (j = 0; j cnt[1]; j++) if (exps[0][i] == exps[1][j]) return size_binop (MINUS_EXPR, offs[0][i], offs[1][j]); return NULL_TREE; } +/* Optimize + ptr = malloc (n); + memset (ptr, 0, 
n); + into + ptr = calloc (n); + gsi_p is known to point to a call to __builtin_memset. */ +static bool +simplify_malloc_memset (gimple_stmt_iterator *gsi_p) +{ + /* First make sure we have: + ptr = malloc (n); + memset (ptr, 0, n); */ + gimple stmt2 = gsi_stmt (*gsi_p); + if (!integer_zerop (gimple_call_arg (stmt2, 1))) +return false; + tree ptr1, ptr2 = gimple_call_arg (stmt2, 0); + tree size = gimple_call_arg (stmt2, 2); + if (TREE_CODE (ptr2) != SSA_NAME) +return false; + gimple stmt1 = SSA_NAME_DEF_STMT (ptr2); + tree callee1; + /* Handle the case where STMT1 is a unary PHI, which happends + for instance with: + while (!(p = malloc (n))) { ... } + memset (p, 0, n); */ + if (!stmt1) +return false; + if (gimple_code (stmt1) == GIMPLE_PHI + gimple_phi_num_args (stmt1) == 1) +{ + ptr1 = gimple_phi_arg_def (stmt1, 0); + if (TREE_CODE (ptr1) != SSA_NAME) + return false; + stmt1 = SSA_NAME_DEF_STMT (ptr1); +} + else +ptr1 = ptr2; + if (!stmt1 + || !is_gimple_call (stmt1) + || !(callee1 = gimple_call_fndecl (stmt1))) +return false; + + bool is_calloc; + if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MALLOC) +{ + is_calloc = false; + if (!operand_equal_p (gimple_call_arg (stmt1, 0), size, 0)) + return false; +} + else if (DECL_FUNCTION_CODE