https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122505

            Bug ID: 122505
           Summary: [avr] Fix bloated mulpsi3 in the wake of hacking
                    around PR118012
           Product: gcc
           Version: 15.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: gjl at gcc dot gnu.org
  Target Milestone: ---

PR118012 is a bad optimization performed by match.pd's

/* (zero_one == 0) ? y : z <op> y -> ((typeof(y))zero_one * z) <op> y */
/* (zero_one != 0) ? z <op> y : y -> ((typeof(y))zero_one * z) <op> y */

These introduce very expensive code like a 32-bit multiplication instead of
just a bit-test.

As there is no way for a backend to disable such patterns, and any such means
is not wanted by the maintainers, the only way to hack around the insane code
is to add patterns to the avr backend that try to rectify that.  The added
patterns are combine insns that map multiplication with a 1-bit constant to
something better.

To that end, avr.md added mulsi3 also for devices that do not have MUL (since
otherwise a libcall will be performed which cannot be insn-combined).  However,
mulsi3 is also used for 24-bit multiplications:

__uint24 mul24 (__uint24 a, __uint24 b)
{
    return a * b;
}

$ avr-gcc -Os -mmcu=atmega103 

generates that:

mul24:
        push r28                 ;  34  [c=4 l=1]  pushqi1/0
        push r29                 ;  35  [c=4 l=1]  pushqi1/0
        in r28,__SP_L__  ;  47  [c=4 l=2]  *movhi/7
        in r29,__SP_H__
        sbiw r28,8       ;  48  [c=8 l=1]  *addhi3/2
        in __tmp_reg__,__SREG__  ;  38  [c=8 l=5]  movhi_sp_r/2
        cli
        out __SP_H__,r29
        out __SREG__,__tmp_reg__
        out __SP_L__,r28
/* prologue: function */
/* frame size = 8 */
/* stack size = 10 */
        std Y+1,r22      ;  49  [c=4 l=1]  movqi_insn/2
        std Y+2,r23      ;  50  [c=4 l=1]  movqi_insn/2
        std Y+3,r24      ;  51  [c=4 l=1]  movqi_insn/2
        std Y+5,r18      ;  52  [c=4 l=1]  movqi_insn/2
        std Y+6,r19      ;  53  [c=4 l=1]  movqi_insn/2
        std Y+7,r20      ;  54  [c=4 l=1]  movqi_insn/2
        ldd r18,Y+1      ;  55  [c=4 l=1]  movqi_insn/3
        ldd r19,Y+2      ;  56  [c=4 l=1]  movqi_insn/3
        ldd r20,Y+3      ;  57  [c=4 l=1]  movqi_insn/3
        ldd r21,Y+4      ;  58  [c=4 l=1]  movqi_insn/3
        ldd r22,Y+5      ;  59  [c=4 l=1]  movqi_insn/3
        ldd r23,Y+6      ;  60  [c=4 l=1]  movqi_insn/3
        ldd r24,Y+7      ;  61  [c=4 l=1]  movqi_insn/3
        ldd r25,Y+8      ;  62  [c=4 l=1]  movqi_insn/3
        call __mulsi3    ;  33  [c=20 l=2]  *mulsi3_call_pr118012
/* epilogue start */
        adiw r28,8       ;  63  [c=8 l=1]  *addhi3/2
        in __tmp_reg__,__SREG__  ;  42  [c=8 l=5]  movhi_sp_r/2
        cli
        out __SP_H__,r29
        out __SREG__,__tmp_reg__
        out __SP_L__,r28
        pop r29          ;  43  [c=4 l=1]  popqi
        pop r28          ;  44  [c=4 l=1]  popqi
        ret              ; 

The expected code is just a __mulpsi3 libcall:

mul24:
        call __mulpsi3   ;  9   [c=20 l=2]  call_value_insn/1
/* epilogue start */
        ret              ;  24  [c=0 l=1]  return

As it turns out, adding a mulpsi3 insn also for cores without MUL can mitigate
that problem.

Reply via email to