https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122440

            Bug ID: 122440
           Summary: [16 Regression] nvptx: '[-PASS:-]{+FAIL:+}
                    gcc.dg/tree-ssa/pr91482.c scan-tree-dump store-merging
                    "New sequence of 1 stores to replace old one of 2
                    stores"'
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Keywords: missed-optimization, testsuite-fail
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tschwinge at gcc dot gnu.org
                CC: pinskia at gcc dot gnu.org
  Target Milestone: ---
            Target: nvptx

I've bisected that for '--target=nvptx-none', commit
r16-4081-g966cdec2b23b8049c263e2239a3da38937ae01d2 "gimple-fold/fab: Move
ASSUME_ALIGNED handling to gimple-fold [PR121762]" is causing:

    PASS: gcc.dg/tree-ssa/pr91482.c (test for excess errors)
    PASS: gcc.dg/tree-ssa/pr91482.c scan-tree-dump-times ccp1
"__builtin_assume_aligned" 1
    [-PASS:-]{+FAIL:+} gcc.dg/tree-ssa/pr91482.c scan-tree-dump store-merging
"New sequence of 1 stores to replace old one of 2 stores"

'store-merging' dump and output assembly before vs. after:

     Coalescing successful!
     Merged into 1 stores
    -New sequence of 1 stores to replace old one of 2 stores
    -Merging successful!
     void write64 (void * p)
     {
       unsigned int * p1;

       <bb 2> [local count: 1073741824]:
    -  p1_2 = p_1(D);
    -  MEM <unsigned long> [(unsigned int *)p1_2] = 4294967296;
    +  MEM[(unsigned int *)p_1(D)] = 0;
    +  MEM[(unsigned int *)p_1(D) + 4B] = 1;
       return;

     }

     .visible .func write64 (.param.u64 %in_ar0)
     {
             .reg.u64 %ar0;
             ld.param.u64 %ar0, [%in_ar0];
             .reg.u64 %r22;
    -        .reg.u64 %r23;
    +        .reg.u32 %r23;
    +        .reg.u32 %r24;
                     mov.u64 %r22, %ar0;
    -                mov.u64 %r23, 4294967296;
    -                st.u64  [%r22], %r23;
    +                mov.u32 %r23, 0;
    +                st.u32  [%r22], %r23;
    +                mov.u32 %r24, 1;
    +                st.u32  [%r22+4], %r24;
             ret;
     }


That test case originates in commit d6dea10acfd9d775f260a2e7c319bb1ee64c0af0
(Subversion r274796) "re PR tree-optimization/91482 (__builtin_assume_aligned
should not break write combining)".  (Doesn't give me any clue.)


For both x86_64 and nvptx, we have the same before vs. after differences in
'pr91482.c.218t.forwprop4', 'pr91482.c.219t.sink2', 'pr91482.c.220t.phiopt4',
'pr91482.c.221t.fab1', and 'pr91482.c.222t.widening_mul', but then the two
targets diverge.  For x86_64, 'pr91482.c.223t.store-merging' shows:

     Processing basic block <2>:
     Starting active chain number 1 with statement:
    -*p1_2 = 0;
    +MEM[(unsigned int *)p_1(D)] = 0;
     The base object is:
    -p1_2
    +p_1(D)
     Recording immediate store from stmt:
    -MEM[(unsigned int *)p1_2 + 4B] = 1;
    +MEM[(unsigned int *)p_1(D) + 4B] = 1;
     stmt causes chain termination:
     return;
     Terminating chain with 2 stores
    @@ -30,7 +30,7 @@
     Merged into 1 stores
     New sequence of 1 stores to replace old one of 2 stores
     # .MEM_5 = VDEF <.MEM_4>
    -MEM <unsigned long> [(unsigned int *)p1_2] = 4294967296;
    +MEM <unsigned long> [(unsigned int *)p_1(D)] = 4294967296;
     Merging successful!

     Pass statistics of "store-merging": ----------------
    @@ -44,11 +44,8 @@
     ;;   basic block 2, loop depth 0, count 1073741824 (estimated locally,
freq 1.0000), maybe hot
     ;;    prev block 0, next block 1, flags: (NEW, REACHABLE, VISITED)
     ;;    pred:       ENTRY [always]  count:1073741824 (estimated locally,
freq 1.0000) (FALLTHRU,EXECUTABLE)
    -  # PT = nonlocal null
    -  # ALIGN = 8, MISALIGN = 0
    -  gimple_assign <ssa_name, p1_2, p_1(D), NULL, NULL>
       # .MEM_5 = VDEF <.MEM_3(D)>
    -  gimple_assign <integer_cst, MEM <unsigned long> [(unsigned intD.9
*)p1_2], 4294967296, NULL, NULL>
    +  gimple_assign <integer_cst, MEM <unsigned long> [(unsigned intD.9
*)p_1(D)], 4294967296, NULL, NULL>
       # VUSE <.MEM_5>
       gimple_return <NULL>
     ;;    succ:       EXIT [always]  count:1073741824 (estimated locally, freq
1.0000) (EXECUTABLE)
[...]/source-gcc/gcc/testsuite/gcc.dg/tree-ssa/pr91482.c:10:1

... vs. nvptx:

    [same as for x86_64]
     Merged into 1 stores
    -New sequence of 1 stores to replace old one of 2 stores
    -# .MEM_5 = VDEF <.MEM_4>
    -MEM <unsigned long> [(unsigned int *)p1_2] = 4294967296;
    -Merging successful!
    +Exceeded original number of stmts (2).  Not profitable to emit new
sequence.

     Pass statistics of "store-merging": ----------------

    @@ -44,11 +41,10 @@
     ;;   basic block 2, loop depth 0, count 1073741824 (estimated locally,
freq 1.0000), maybe hot
     ;;    prev block 0, next block 1, flags: (NEW, REACHABLE, VISITED)
     ;;    pred:       ENTRY [always]  count:1073741824 (estimated locally,
freq 1.0000) (FALLTHRU,EXECUTABLE)
    -  # PT = nonlocal null
    -  # ALIGN = 8, MISALIGN = 0
    -  gimple_assign <ssa_name, p1_2, p_1(D), NULL, NULL>
    -  # .MEM_5 = VDEF <.MEM_3(D)>
    -  gimple_assign <integer_cst, MEM <unsigned long> [(unsigned intD.4
*)p1_2], 4294967296, NULL, NULL>
    +  # .MEM_4 = VDEF <.MEM_3(D)>
    +  gimple_assign <integer_cst, MEM[(unsigned intD.4 *)p_1(D)], 0, NULL,
NULL>
    +  # .MEM_5 = VDEF <.MEM_4>
    +  gimple_assign <integer_cst, MEM[(unsigned intD.4 *)p_1(D) + 4B], 1,
NULL, NULL>
       # VUSE <.MEM_5>
       gimple_return <NULL>
     ;;    succ:       EXIT [always]  count:1073741824 (estimated locally, freq
1.0000) (EXECUTABLE)
[...]/source-gcc/gcc/testsuite/gcc.dg/tree-ssa/pr91482.c:10:1

So: 'Exceeded original number of stmts (2).  Not profitable to emit new
sequence.'  Is that another (missing) GCC/nvptx target instruction costing
issue?  (I shall look into that, really!)

Reply via email to