This teaches loop distribution to distribute nested loops.  I plan
to commit the trivial bits of it, but not the rest of the patch,
until I have an idea of how best to limit the loop nest walk
(it tries distributing nests from outer to inner loops, re-doing
dependence analysis and the RDG build for each attempt).
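
To illustrate the walk, here is a minimal, self-contained C sketch of that
outer-to-inner retry (this is not the GCC code; loop_info, try_distribute
and the loop numbers are made up for the example):

  /* Walk from the outermost candidate loop towards the innermost one and
     stop at the first level where distribution succeeds.  In the real pass
     every attempt re-does dependence analysis and rebuilds the RDG.  */
  #include <stdio.h>

  struct loop_info { int num; struct loop_info *inner; };

  /* Stand-in for distribute_loop (); pretend only loop 2 can be split.  */
  static int try_distribute (struct loop_info *l) { return l->num == 2 ? 2 : 0; }

  int main (void)
  {
    struct loop_info l2 = { 2, NULL }, l1 = { 1, &l2 }, l0 = { 0, &l1 };
    for (struct loop_info *l = &l0; l; l = l->inner)
      {
        int n = try_distribute (l);
        printf ("loop %d: %s\n", l->num, n > 0 ? "distributed" : "unchanged");
        if (n > 0)
          break;    /* success at this level, do not descend further */
      }
    return 0;
  }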

At this point loop distribution needs a better cost model, the ability
to turn flow dependences into data dependences, and the ability to turn
data dependences into partition ordering dependences.
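
As a concrete illustration of the second point, here is a small made-up C
example (not from the patch): distribution has to honor the flow dependence
on c[i] by re-loading it from memory in the second loop, i.e. by treating
it as a data dependence between the split loops.

  /* Hypothetical example: splitting this loop into two loops requires
     treating the flow dependence on c[i] as a memory (data) dependence,
     so that the second loop re-loads c[i] instead of reusing the value
     still live in a register.  */
  void
  f (int n, double *a, double *b, double *c, double *d)
  {
    for (int i = 0; i < n; i++)
      {
        c[i] = a[i] + b[i];    /* candidate partition 1 */
        d[i] = c[i] * 2.0;     /* candidate partition 2, flow dep on c[i] */
      }
  }

Similarly, an output dependence such as the two stores to y(l,i,j,k) in the
new gfortran.dg/ldist-2.f testcase would only need to constrain the order
of the resulting loops instead of forcing both stores into one partition.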

Still, the first thing for me to tackle is recognizing some more patterns.

Bootstrapped with -ftree-loop-distribution enabled and tested on
x86_64-unknown-linux-gnu.

Richard.

2013-09-17  Richard Biener  <rguent...@suse.de>

        * tree-loop-distribution.c (ssa_name_has_uses_outside_loop_p):
        Properly handle loop nests.
        (classify_partition): Disable builtins for loop nests.
        (similar_memory_accesses): Refine cost model.
        (distribute_loop): Dump which loop we are trying to distribute.
        (tree_loop_distribution): Handle distribution of nested loops.

        * gfortran.dg/ldist-2.f: New testcase.
        * gcc.dg/tree-ssa/ldist-5.c: Adjust XFAIL reason.

Index: trunk/gcc/testsuite/gfortran.dg/ldist-2.f
===================================================================
*** /dev/null   1970-01-01 00:00:00.000000000 +0000
--- trunk/gcc/testsuite/gfortran.dg/ldist-2.f   2013-09-17 13:42:22.144740768 +0200
***************
*** 0 ****
--- 1,64 ----
+ ! { dg-do compile }
+ ! { dg-options "-O3 -fno-tree-loop-im -ftree-loop-distribution -fdump-tree-ldist-details" }
+ 
+ ! Testcase from bwaves block_solver.f
+         subroutine mat_times_vec(y,x,a,axp,ayp,azp,axm,aym,azm,
+      $  nb,nx,ny,nz)
+         implicit none
+         integer nb,nx,ny,nz,i,j,k,m,l,kit,im1,ip1,jm1,jp1,km1,kp1
+ 
+         real*8 y(nb,nx,ny,nz),x(nb,nx,ny,nz)
+ 
+         real*8 a(nb,nb,nx,ny,nz),
+      1  axp(nb,nb,nx,ny,nz),ayp(nb,nb,nx,ny,nz),azp(nb,nb,nx,ny,nz),
+      2  axm(nb,nb,nx,ny,nz),aym(nb,nb,nx,ny,nz),azm(nb,nb,nx,ny,nz)
+ 
+ 
+       do k=1,nz
+ c         do j=1,ny
+ c            do i=1,nx
+ c               do l=1,nb
+ c                  y(l,i,j,k)=0.0d0
+ c               enddo
+ c            enddo
+ c         enddo
+ 
+          km1=mod(k+nz-2,nz)+1
+          kp1=mod(k,nz)+1
+          do j=1,ny
+             jm1=mod(j+ny-2,ny)+1
+             jp1=mod(j,ny)+1
+             do i=1,nx
+                im1=mod(i+nx-2,nx)+1
+                ip1=mod(i,nx)+1
+                do l=1,nb
+                   y(l,i,j,k)=0.0d0
+                   do m=1,nb
+                      y(l,i,j,k)=y(l,i,j,k)+
+      1               a(l,m,i,j,k)*x(m,i,j,k)+
+      2               axp(l,m,i,j,k)*x(m,ip1,j,k)+
+      3               ayp(l,m,i,j,k)*x(m,i,jp1,k)+
+      4               azp(l,m,i,j,k)*x(m,i,j,kp1)+
+      5               axm(l,m,i,j,k)*x(m,im1,j,k)+
+      6               aym(l,m,i,j,k)*x(m,i,jm1,k)+
+      7               azm(l,m,i,j,k)*x(m,i,j,km1)
+                   enddo
+                enddo
+             enddo
+          enddo
+         enddo
+ 
+ 
+ 
+ c        y=x
+ c        where (mask) y=tmp
+         return
+         end
+ 
+ ! We fail to distribute the loop because the output dependence for the
+ ! two stores to y(l,i,j,k) forces them into the same partition.  This is
+ ! because loop distribution does not promote such dependences into
+ ! constraints on partition ordering.
+ 
+ ! { dg-final { scan-tree-dump "distributed: split to 2 loops" "ldist" { xfail *-*-* } } }
+ ! { dg-final { cleanup-tree-dump "ldist" } }
Index: trunk/gcc/tree-loop-distribution.c
===================================================================
*** trunk.orig/gcc/tree-loop-distribution.c     2013-09-17 11:51:49.000000000 +0200
--- trunk/gcc/tree-loop-distribution.c  2013-09-17 14:03:53.378065359 +0200
*************** ssa_name_has_uses_outside_loop_p (tree d
*** 624,630 ****
      {
        gimple use_stmt = USE_STMT (use_p);
        if (!is_gimple_debug (use_stmt)
!         && loop != loop_containing_stmt (use_stmt))
        return true;
      }
  
--- 624,631 ----
      {
        gimple use_stmt = USE_STMT (use_p);
        if (!is_gimple_debug (use_stmt)
!         && loop != loop_containing_stmt (use_stmt)
!         && !flow_loop_nested_p (loop, loop_containing_stmt (use_stmt)))
        return true;
      }
  
*************** classify_partition (loop_p loop, struct
*** 1139,1149 ****
        if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
!           fprintf (dump_file, "not generating builtin, partition has "
                     "scalar uses outside of the loop\n");
          partition->kind = PKIND_REDUCTION;
          return;
        }
      }
  
    /* Perform general partition disqualification for builtins.  */
--- 1140,1162 ----
        if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
!           fprintf (dump_file, "not trying to generate builtin, partition has "
                     "scalar uses outside of the loop\n");
          partition->kind = PKIND_REDUCTION;
          return;
        }
+ 
+       /* If the partition needs stmts from outer loops fail.
+          ???  We are not properly re-instantiating outer loops
+        when we generate the builtin call for the partition.
+        ???  Rather check the access functions of the DRs.  */
+       if (loop_containing_stmt (stmt) != loop)
+       {
+         if (dump_file && (dump_flags & TDF_DETAILS))
+           fprintf (dump_file, "not trying to generate builtin, partition "
+                    "covers more than one loop\n");
+         return;
+       }
      }
  
    /* Perform general partition disqualification for builtins.  */
*************** similar_memory_accesses (struct graph *r
*** 1305,1312 ****
      if (RDG_MEM_WRITE_STMT (rdg, i)
        || RDG_MEM_READS_STMT (rdg, i))
        EXECUTE_IF_SET_IN_BITMAP (partition2->stmts, 0, j, bj)
!       if (RDG_MEM_WRITE_STMT (rdg, j)
!           || RDG_MEM_READS_STMT (rdg, j))
          {
            FOR_EACH_VEC_ELT (RDG_DATAREFS (rdg, i), k, ref1)
              {
--- 1318,1331 ----
      if (RDG_MEM_WRITE_STMT (rdg, i)
        || RDG_MEM_READS_STMT (rdg, i))
        EXECUTE_IF_SET_IN_BITMAP (partition2->stmts, 0, j, bj)
!       if ((RDG_MEM_WRITE_STMT (rdg, j)
!            || RDG_MEM_READS_STMT (rdg, j))
!           /* But only consider memory accesses related if they
!              happen on the same loop.  This allows the zeroing
!              in an outer loop to be split from the store in the
!              inner loop.  See gfortran.dg/ldist-2.f.  */
!           && (loop_containing_stmt (RDG_STMT (rdg, i))
!               == loop_containing_stmt (RDG_STMT (rdg, j))))
          {
            FOR_EACH_VEC_ELT (RDG_DATAREFS (rdg, i), k, ref1)
              {
*************** distribute_loop (struct loop *loop, vec<
*** 1502,1508 ****
      }
  
    if (dump_file && (dump_flags & TDF_DETAILS))
!     dump_rdg (dump_file, rdg);
  
    partitions.create (3);
    rdg_build_partitions (rdg, stmts, &partitions);
--- 1521,1530 ----
      }
  
    if (dump_file && (dump_flags & TDF_DETAILS))
!     {
!       fprintf (dump_file, "Trying to distribute loop %d\n", loop->num);
!       dump_rdg (dump_file, rdg);
!     }
  
    partitions.create (3);
    rdg_build_partitions (rdg, stmts, &partitions);
*************** tree_loop_distribution (void)
*** 1652,1743 ****
        gimple_set_uid (gsi_stmt (gsi), -1);
      }
  
-   /* We can at the moment only distribute non-nested loops, thus restrict
-      walking to innermost loops.  */
    FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
      {
        vec<gimple> work_list = vNULL;
        basic_block *bbs;
-       int num = loop->num;
        int nb_generated_loops = 0;
        unsigned int i;
  
        /* If the loop doesn't have a single exit we will fail anyway,
         so do that early.  */
        if (!single_exit (loop))
!       continue;
  
!       /* Only optimize hot loops.  */
!       if (!optimize_loop_for_speed_p (loop))
!       continue;
  
!       /* Initialize the worklist with stmts we seed the partitions with.  */
!       bbs = get_loop_body_in_dom_order (loop);
!       for (i = 0; i < loop->num_nodes; ++i)
        {
!         gimple_stmt_iterator gsi;
!         for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
!           {
!             gimple phi = gsi_stmt (gsi);
!             if (virtual_operand_p (gimple_phi_result (phi)))
!               continue;
!             /* Distribute stmts which have defs that are used outside of
!                the loop.  */
!             if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
!               continue;
!             work_list.safe_push (phi);
!           }
!         for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
!           {
!             gimple stmt = gsi_stmt (gsi);
  
!             /* If there is a stmt with side-effects bail out - we
!                cannot and should not distribute this loop.  */
!             if (gimple_has_side_effects (stmt))
                {
!                 work_list.truncate (0);
!                 goto out;
                }
  
!             /* Distribute stmts which have defs that are used outside of
!                the loop.  */
!             if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
!               ;
!             /* Otherwise only distribute stores for now.  */
!             else if (!gimple_assign_single_p (stmt)
!                      || is_gimple_reg (gimple_assign_lhs (stmt)))
!               continue;
  
!             work_list.safe_push (stmt);
            }
!       }
! out:
!       free (bbs);
  
!       if (work_list.length () > 0)
!       {
!         if (!cd)
            {
!             calculate_dominance_info (CDI_POST_DOMINATORS);
!             cd = new control_dependences (create_edge_list ());
!             free_dominance_info (CDI_POST_DOMINATORS);
            }
-         nb_generated_loops = distribute_loop (loop, work_list, cd);
-       }
  
!       if (nb_generated_loops > 0)
!       changed = true;
  
!       if (dump_file && (dump_flags & TDF_DETAILS))
!       {
!         if (nb_generated_loops > 1)
!           fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
!                    num, nb_generated_loops);
!         else
!           fprintf (dump_file, "Loop %d is the same.\n", num);
        }
  
!       work_list.release ();
      }
  
    if (cd)
--- 1674,1800 ----
        gimple_set_uid (gsi_stmt (gsi), -1);
      }
  
    FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
      {
        vec<gimple> work_list = vNULL;
        basic_block *bbs;
        int nb_generated_loops = 0;
        unsigned int i;
  
+       /* Only optimize if the innermost loop is hot.  */
+       if (!optimize_loop_for_speed_p (loop))
+       continue;
+ 
        /* If the loop doesn't have a single exit we will fail anyway,
         so do that early.  */
        if (!single_exit (loop))
!       break;
  
!       /* Find the outermost acceptable loop.  Do so only when asked
!          to do real loop distribution as we don't have a pattern
!        covering a loop nest yet.  */
!       if (flag_tree_loop_distribution)
!       for (struct loop *outer = loop_outer (loop);
!            loop_outer (outer) != NULL; outer = loop_outer (outer))
!         {
!           /* If the loop has children with siblings we will fail
!              anyway because data dependence analysis cannot handle
!              this situation.  */
!           if (outer->inner && outer->inner->next != NULL)
!             break;
! 
!           /* If the loop doesn't have a single exit we will fail anyway,
!              so do that early.  */
!           if (!single_exit (outer))
!             break;
  
!           loop = outer;
!         }
! 
!       /* From the outermost acceptable loop try distributing it and stop
!        once that succeeds, otherwise try inner loops.  */
!       do
        {
!         int num = loop->num;
  
!         /* Initialize the worklist with stmts we seed the partitions with.  */
!         bbs = get_loop_body_in_dom_order (loop);
!         for (i = 0; i < loop->num_nodes; ++i)
!           {
!             gimple_stmt_iterator gsi;
!             for (gsi = gsi_start_phis (bbs[i]);
!                  !gsi_end_p (gsi); gsi_next (&gsi))
                {
!                 gimple phi = gsi_stmt (gsi);
!                 if (virtual_operand_p (gimple_phi_result (phi)))
!                   continue;
!                 /* Distribute stmts which have defs that are used outside of
!                    the loop.  */
!                 if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
!                   continue;
!                 work_list.safe_push (phi);
                }
+             for (gsi = gsi_start_bb (bbs[i]);
+                  !gsi_end_p (gsi); gsi_next (&gsi))
+               {
+                 gimple stmt = gsi_stmt (gsi);
+ 
+                 /* If there is a stmt with side-effects bail out - we
+                    cannot and should not distribute this loop.  */
+                 if (gimple_has_side_effects (stmt))
+                   {
+                     work_list.release ();
+                     free (bbs);
+                     goto next_nest;
+                   }
  
!                 /* Distribute stmts which have defs that are used outside of
!                    the loop.  */
!                 if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
!                   ;
!                 /* Otherwise only distribute stores for now.  */
!                 else if (!gimple_assign_single_p (stmt)
!                          || is_gimple_reg (gimple_assign_lhs (stmt)))
!                   continue;
  
!                 work_list.safe_push (stmt);
!               }
            }
!         free (bbs);
  
!         if (work_list.length () > 0)
!           {
!             if (!cd)
!               {
!                 calculate_dominance_info (CDI_POST_DOMINATORS);
!                 cd = new control_dependences (create_edge_list ());
!                 free_dominance_info (CDI_POST_DOMINATORS);
!               }
!             nb_generated_loops = distribute_loop (loop, work_list, cd);
!           }
! 
!         if (nb_generated_loops > 0)
!           changed = true;
! 
!         if (dump_file && (dump_flags & TDF_DETAILS))
            {
!             if (nb_generated_loops > 1)
!               fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
!                        num, nb_generated_loops);
!             else
!               fprintf (dump_file, "Loop %d is the same.\n", num);
            }
  
!         work_list.release ();
  
!         if (nb_generated_loops > 0)
!           break;
! 
!         loop = loop->inner;
        }
+       while (loop);
  
! next_nest:;
      }
  
    if (cd)
Index: trunk/gcc/testsuite/gcc.dg/tree-ssa/ldist-5.c
===================================================================
*** trunk.orig/gcc/testsuite/gcc.dg/tree-ssa/ldist-5.c  2013-09-17 11:51:49.000000000 +0200
--- trunk/gcc/testsuite/gcc.dg/tree-ssa/ldist-5.c       2013-09-17 13:18:27.544714694 +0200
*************** int loop1 (int k)
*** 26,33 ****
    return a[100-1][100-1] + b[100-1][100-1] + c[100-1][100-1] + d[100-1][100-1];
  }
  
! /* FIXME: This is XFAILed because of a data dependence analysis
!    problem: the dependence test fails with a "don't know" relation.  */
  
  /* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" { xfail *-*-* } } } */
  /* { dg-final { cleanup-tree-dump "ldist" } } */
--- 26,38 ----
    return a[100-1][100-1] + b[100-1][100-1] + c[100-1][100-1] + d[100-1][100-1];
  }
  
! /* FIXME: This is XFAILed because all four initial partitions contain
!    references to a[i][j] and thus they are merged because they have
!    similar memory accesses.  The main reason is that loop-distribution
!    does not consider re-loading flow dependences from memory, turning
!    them into data dependences.  In this case loading c[i][j] when
!    producing d[i][j] instead of re-computing it via its RHS.  This is
!    the result of scalar CSE.  */
  
  /* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" { xfail *-*-* } } } */
  /* { dg-final { cleanup-tree-dump "ldist" } } */
