On Tue, Oct 18, 2011 at 4:14 PM, William J. Schmidt <wschm...@linux.vnet.ibm.com> wrote: > Greetings, > > Here is a new revision of the tree portions of this patch. I moved the > pattern recognizer to expand, and added additional logic to look for the > same pattern in gimple form. I added two more tests to verify the new > logic. > > I didn't run into any problems with the RTL CSE phases. I can't recall > for certain what caused me to abandon the expand version previously. > There may not have been good reason; too many versions to keep track of > and too many interruptions, I suppose. In any case, I'm much happier > having this code in the expander. > > Paolo's RTL logic for unpropagating the zero-offset case is not going to > work out as is. It causes a number of performance degradations, which I > suspect are due to the pass reordering. That's a separate issue, > though, and not needed for this patch. > > Bootstrapped and regression-tested on powerpc64-linux. SPEC cpu2000 > shows a number of small improvements and no significant degradations. > SPEC cpu2006 testing is pending. > > Thanks, > Bill > > > 2011-10-18 Bill Schmidt <wschm...@linux.vnet.ibm.com> > > gcc: > > PR rtl-optimization/46556 > * expr.c (tree-pretty-print.h): New include. > (restructure_base_and_offset): New function. > (restructure_mem_ref): New function. > (expand_expr_real_1): In MEM_REF case, attempt restructure_mem_ref > first. In normal_inner_ref case, attempt restructure_base_and_offset > first. > * Makefile.in: Update dependences for expr.o. > > gcc/testsuite: > > PR rtl-optimization/46556 > * gcc.dg/tree-ssa-pr46556-1.c: New test. > * gcc.dg/tree-ssa-pr46556-2.c: Likewise. > * gcc.dg/tree-ssa-pr46556-3.c: Likewise. > * gcc.dg/tree-ssa-pr46556-4.c: Likewise. > * gcc.dg/tree-ssa-pr46556-5.c: Likewise. > > > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c > =================================================================== > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c (revision 0) > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c (revision 0) > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-rtl-expand" } */ > + > +struct x > +{ > + int a[16]; > + int b[16]; > + int c[16]; > +}; > + > +extern void foo (int, int, int); > + > +void > +f (struct x *p, unsigned int n) > +{ > + foo (p->a[n], p->c[n], p->b[n]); > +} > + > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 2 "expand" } } */ > +/* { dg-final { scan-rtl-dump-times "const_int 128" 1 "expand" } } */ > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 1 > "expand" } } */ > +/* { dg-final { cleanup-rtl-dump "expand" } } */ > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c > =================================================================== > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c (revision 0) > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c (revision 0) > @@ -0,0 +1,26 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-rtl-expand" } */ > + > +struct x > +{ > + int a[16]; > + int b[16]; > + int c[16]; > +}; > + > +extern void foo (int, int, int); > + > +void > +f (struct x *p, unsigned int n) > +{ > + foo (p->a[n], p->c[n], p->b[n]); > + if (n > 12) > + foo (p->a[n], p->c[n], p->b[n]); > + else if (n > 3) > + foo (p->b[n], p->a[n], p->c[n]); > +} > + > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */ > +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */ > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 > "expand" } } */ > +/* { dg-final { cleanup-rtl-dump "expand" } } */ > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c > =================================================================== > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c (revision 0) > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c (revision 0) > @@ -0,0 +1,27 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-rtl-expand" } */ > +struct x > +{ > + int a[16]; > + int b[16]; > + int c[16]; > +}; > + > +extern void foo (int, int, int); > + > +void > +f (struct x *p, unsigned int n) > +{ > + foo (p->a[n], p->c[n], p->b[n]); > + if (n > 3) > + { > + foo (p->a[n], p->c[n], p->b[n]); > + if (n > 12) > + foo (p->b[n], p->a[n], p->c[n]); > + } > +} > + > +/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */ > +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */ > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 > "expand" } } */ > +/* { dg-final { cleanup-rtl-dump "expand" } } */ > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c > =================================================================== > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c (revision 0) > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c (revision 0) > @@ -0,0 +1,28 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-rtl-expand" } */ > + > +struct x > +{ > + int a[16]; > + int b[16]; > + int c[16]; > +}; > + > +extern void foo (int, int, int); > + > +void > +f (struct x *p, unsigned int n) > +{ > + foo (*((int *)p + n*4), *((int *)p + 32 + n*4), *((int *)p + 16 + n*4)); > + if (n > 3) > + { > + foo (*((int *)p + n*4), *((int *)p + 32 + n*4), *((int *)p + 16 + > n*4)); > + if (n > 12) > + foo (*((int *)p + 16 + n*4), *((int *)p + n*4), *((int *)p + 32 + > n*4)); > + } > +} > + > +/* { dg-final { scan-rtl-dump-times "\\(mem:SI \\(plus:" 6 "expand" } } */ > +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */ > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 > "expand" } } */ > +/* { dg-final { cleanup-rtl-dump "expand" } } */ > Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c > =================================================================== > --- gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c (revision 0) > +++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c (revision 0) > @@ -0,0 +1,36 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-rtl-expand" } */ > +/* { dg-skip-if "" { lp64 } { "*" } { "-m32" } } */ > +/* { dg-skip-if "" { llp64 } { "*" } { "-m32" } } */ > + > +struct x > +{ > + int a[16]; > + int b[16]; > + int c[16]; > +}; > + > +extern void foo (int, int, int); > + > +void > +f (struct x *p, unsigned int n) > +{ > + foo (*((int *)p + n * 4), > + *((int *)p + (n + 8) * 4), > + *((int *)p + (n + 4) * 4)); > + if (n > 3) > + { > + foo (*((int *)p + n * 4), > + *((int *)p + (n + 8) * 4), > + *((int *)p + (n + 4) * 4)); > + if (n > 12) > + foo (*((int *)p + (n + 4) * 4), > + *((int *)p + n * 4), > + *((int *)p + (n + 8) * 4)); > + } > +} > + > +/* { dg-final { scan-rtl-dump-times "\\(mem:SI \\(plus:" 6 "expand" } } */ > +/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */ > +/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 > "expand" } } */ > +/* { dg-final { cleanup-rtl-dump "expand" } } */ > Index: gcc/expr.c > =================================================================== > --- gcc/expr.c (revision 180138) > +++ gcc/expr.c (working copy) > @@ -56,6 +56,7 @@ along with GCC; see the file COPYING3. If not see > #include "ssaexpand.h" > #include "target-globals.h" > #include "params.h" > +#include "tree-pretty-print.h" > > /* Decide whether a function's arguments should be processed > from first to last or from last to first. > @@ -7648,7 +7649,157 @@ expand_constructor (tree exp, rtx target, enum exp > return target; > } > > +/* Given BASE, OFFSET, and BITPOS derived from EXPR, determine whether > + there is a profitable opportunity to restructure address arithmetic > + within BASE and OFFSET. If so, produce such a restructuring and > + return it. */ > +/* TODO: This belongs more properly in a separate pass that performs > + general strength reduction on straight-line code. Eventually move > + this there. */ > > +static tree > +restructure_base_and_offset (tree expr, tree base, tree offset, > + HOST_WIDE_INT bitpos) > +{ > + tree mult_op0, mult_op1, t1, t2, offset_type, mult_expr, add_expr, mem_ref; > + double_int c, c1, c2, c3, c4; > + > + /* Look for the following pattern: > + > + base = MEM_REF (T1, C1); > + offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3) > + bitpos = C4 * BITS_PER_UNIT > + > + If found, convert it to: > + > + MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)), > + C1 + (C2 * C3) + C4) > + */ > + if (!base > + || !offset > + || TREE_CODE (base) != MEM_REF > + || TREE_CODE (offset) != MULT_EXPR) > + return NULL_TREE; > + > + mult_op0 = TREE_OPERAND (offset, 0); > + mult_op1 = TREE_OPERAND (offset, 1); > + > + if (TREE_CODE (mult_op0) != PLUS_EXPR > + || TREE_CODE (mult_op1) != INTEGER_CST > + || TREE_CODE (TREE_OPERAND (mult_op0, 1)) != INTEGER_CST) > + return NULL_TREE; > + > + t1 = TREE_OPERAND (base, 0); > + t2 = TREE_OPERAND (mult_op0, 0); > + c1 = mem_ref_offset (base); > + c2 = tree_to_double_int (TREE_OPERAND (mult_op0, 1)); > + c3 = tree_to_double_int (mult_op1); > + c4 = uhwi_to_double_int (bitpos / BITS_PER_UNIT); > + c = double_int_add (double_int_add (c1, double_int_mul (c2, c3)), c4); > + > + offset_type = TREE_TYPE (TREE_OPERAND (base, 1)); > + > + mult_expr = fold_build2 (MULT_EXPR, sizetype, t2, > + double_int_to_tree (sizetype, c3)); > + > + add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr); > + > + mem_ref = fold_build2 (MEM_REF, TREE_TYPE (expr), add_expr, > + double_int_to_tree (offset_type, c)); > + return mem_ref; > +} > + > +/* Look for a hidden array reference in memory reference EXP. If found, > + return a revised memory reference that restructures addressability > + to combine constants. */ > +/* TODO: This belongs more properly in a separate pass that performs > + general strength reduction on straight-line code. Eventually move > + this there. */ > + > +static tree > +restructure_mem_ref (tree exp) > +{ > + tree s2, s3, t1, t2, offset_type, mult_expr, add_expr; > + tree s2_op0, s2_op1, s3_op0, s3_op1, result; > + gimple s1_stmt, s2_stmt, s3_stmt; > + double_int c1, c2, c3, c; > + > + /* Currently we look for the following pattern, where each Si is an > + SSA name, each Tj is an arbitrary tree, and each Ck is an integer > + constant: > + > + S3 = T2 + C2; > + S2 = S3 * C3; > + S1 = T1 ptr+ S2; > + exp = MEM_REF (S1, C1); > + > + This is rewritten as: > + > + exp = MEM_REF (T1 ptr+ (T2 * C3)), C1 + C2 * C3); > + > + */ > + if (TREE_CODE (TREE_OPERAND (exp, 0)) != SSA_NAME) > + return NULL_TREE; > + > + /* We don't use get_def_for_expr for S1 because TER doesn't forward > + S1 in some situations where this transform is useful, such as > + when S1 is the base of two MEM_REFs fitting the pattern. */ > + s1_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (exp, 0));
You can't do this - this will possibly generate wrong code. You _do_ have to use get_def_for_expr. Or do it when we are still in "true" SSA form... Richard. > + if (!s1_stmt > + || !is_gimple_assign (s1_stmt) > + || gimple_assign_rhs_code (s1_stmt) != POINTER_PLUS_EXPR) > + return NULL_TREE; > + > + c1 = mem_ref_offset (exp); > + t1 = gimple_assign_rhs1 (s1_stmt); > + s2 = gimple_assign_rhs2 (s1_stmt); > + > + if (TREE_CODE (s2) != SSA_NAME) > + return NULL_TREE; > + > + s2_stmt = get_def_for_expr (s2, MULT_EXPR); > + > + if (!s2_stmt) > + return NULL_TREE; > + > + s2_op0 = gimple_assign_rhs1 (s2_stmt); > + s2_op1 = gimple_assign_rhs2 (s2_stmt); > + > + if (TREE_CODE (s2_op1) != INTEGER_CST) > + return NULL_TREE; > + > + s3 = s2_op0; > + c3 = tree_to_double_int (s2_op1); > + s3_stmt = get_def_for_expr (s3, PLUS_EXPR); > + > + if (!s3_stmt) > + return NULL_TREE; > + > + s3_op0 = gimple_assign_rhs1 (s3_stmt); > + s3_op1 = gimple_assign_rhs2 (s3_stmt); > + > + if (TREE_CODE (s3_op1) != INTEGER_CST) > + return NULL_TREE; > + > + t2 = s3_op0; > + c2 = tree_to_double_int (s3_op1); > + c = double_int_add (c1, double_int_mul (c2, c3)); > + > + offset_type = TREE_TYPE (TREE_OPERAND (exp, 1)); > + > + mult_expr = fold_build2 (MULT_EXPR, sizetype, t2, > + double_int_to_tree (sizetype, c3)); > + > + add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr); > + > + result = build2 (MEM_REF, TREE_TYPE (exp), add_expr, > + double_int_to_tree (offset_type, c)); > + > + return result; > +} > + > + > /* expand_expr: generate code for computing expression EXP. > An rtx for the computed value is returned. The value is never null. > In the case of a void EXP, const0_rtx is returned. > @@ -9313,6 +9464,28 @@ expand_expr_real_1 (tree exp, rtx target, enum mac > gimple def_stmt; > enum insn_code icode; > unsigned align; > + tree tem; > + > + /* Attempt to restructure a hidden array reference, such > + as *(p + (n + k) * s). This generates dead code, so > + require -O2. */ > + if (optimize > 1 > + && mode != BLKmode > + && ((tem = restructure_mem_ref (exp)) != NULL_TREE)) > + { > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "\nRestructured memory reference:\n"); > + fprintf (dump_file, " Original reference: "); > + print_generic_expr (dump_file, exp, TDF_VOPS|TDF_MEMSYMS); > + fprintf (dump_file, "\n Replacement: "); > + print_generic_expr (dump_file, tem, TDF_VOPS|TDF_MEMSYMS); > + } > + copy_ref_info (tem, exp); > + exp = tem; > + base = TREE_OPERAND (exp, 0); > + } > + > /* Handle expansion of non-aliased memory with non-BLKmode. That > might end up in a register. */ > if (TREE_CODE (base) == ADDR_EXPR) > @@ -9584,7 +9757,7 @@ expand_expr_real_1 (tree exp, rtx target, enum mac > { > enum machine_mode mode1, mode2; > HOST_WIDE_INT bitsize, bitpos; > - tree offset; > + tree offset, tem2; > int volatilep = 0, must_force_mem; > bool packedp = false; > tree tem = get_inner_reference (exp, &bitsize, &bitpos, &offset, > @@ -9601,6 +9774,36 @@ expand_expr_real_1 (tree exp, rtx target, enum mac > && DECL_PACKED (TREE_OPERAND (exp, 1)))) > packedp = true; > > + /* Attempt to restructure the base and offset to combine constants. > + Bitfield references should eventually be lowered prior to this > + point and are not dealt with. Skip BLKmode aggregates as well. > + This generates dead code, so require -O2. */ > + if (optimize > 1 > + && code != BIT_FIELD_REF > + && (code != COMPONENT_REF > + || !DECL_BIT_FIELD (TREE_OPERAND (exp, 1))) > + && TYPE_MODE (TREE_TYPE (exp)) != BLKmode > + && bitpos % BITS_PER_UNIT == 0 > + && ((tem2 = restructure_base_and_offset (exp, tem, offset, > bitpos)) > + != NULL_TREE)) > + { > + copy_ref_info (tem2, exp); > + tem = tem2; > + /* The offset and bitpos were absorbed into the new MEM_REF, so > + make sure we don't add them in again. */ > + offset = NULL_TREE; > + bitpos = 0; > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "\nRestructured inner reference:\n"); > + fprintf (dump_file, " Original reference: "); > + print_generic_expr (dump_file, exp, TDF_VOPS|TDF_MEMSYMS); > + fprintf (dump_file, "\n Replacement: "); > + print_generic_expr (dump_file, tem, TDF_VOPS|TDF_MEMSYMS); > + } > + } > + > /* If TEM's type is a union of variable size, pass TARGET to the inner > computation, since it will need a temporary and TARGET is known > to have to do. This occurs in unchecked conversion in Ada. */ > Index: gcc/Makefile.in > =================================================================== > --- gcc/Makefile.in (revision 180138) > +++ gcc/Makefile.in (working copy) > @@ -2926,7 +2926,7 @@ expr.o : expr.c $(CONFIG_H) $(SYSTEM_H) coretypes. > reload.h langhooks.h intl.h $(TM_P_H) $(TARGET_H) \ > tree-iterator.h gt-expr.h $(MACHMODE_H) $(TIMEVAR_H) $(TREE_FLOW_H) \ > $(TREE_PASS_H) $(DF_H) $(DIAGNOSTIC_H) vecprim.h $(SSAEXPAND_H) \ > - $(PARAMS_H) $(COMMON_TARGET_H) > + $(PARAMS_H) $(COMMON_TARGET_H) tree-pretty-print.h > dojump.o : dojump.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) > $(TREE_H) \ > $(FLAGS_H) $(FUNCTION_H) $(EXPR_H) $(OPTABS_H) $(INSN_ATTR_H) > insn-config.h \ > langhooks.h $(GGC_H) gt-dojump.h vecprim.h $(BASIC_BLOCK_H) output.h > > >