On Mon, Aug 19, 2019 at 10:36 PM Florian Klaempfl <flor...@freepascal.org> wrote:
> Am 19.08.2019 um 22:20 schrieb Christo Crause: > > I'm interested in trying to improve the code generated for shift > > operations (in particular involving a compile time constant shift) for > > the AVR target. > For me the idea looks good, actually, I planned once to do the same, but > got stuck for time reasons with the beginnings you discovered. Attached is a patch that optimizes SHL and SHR for a constant right value, for comment. Note that I haven't implemented support for SAR, ROL or ROR; these operations should fall through to the current code path. A heuristic to decide when loop unrolling is favoured is also included, taking into account -Os. Some of the details behind the optimizations are discussed here: https://github.com/ccrause/freepascal/wiki/Optimizing-code-generation-for-shift-with-compile-time-constant One remaining optimization I would like to implement is to eliminate loading part of a variable into a register when that part is discarded by the shift amount. The loading of the variable and the saving of the result from/to memory happen before tcgavr.a_op_const_reg_reg gets called. Is there a way to remove such a redundant load instruction from tcgavr.a_op_const_reg_reg? Or at least mark it for removal at a later stage (e.g. by the peephole optimizer)? I assume it would make sense to also include SAR support in this patch, since it could be generated by div? best wishes, Christo
diff --git compiler/avr/cgcpu.pas compiler/avr/cgcpu.pas index 4be4f4823e..ba6a06427e 100644 --- compiler/avr/cgcpu.pas +++ compiler/avr/cgcpu.pas @@ -438,6 +438,11 @@ unit cgcpu; procedure tcgavr.a_op_const_reg_reg(list: TAsmList; op: TOpCg; size: tcgsize; a: tcgint; src, dst: tregister); + var + tmpSrc, tmpDst, countreg: TRegister; + b, b2, i, j: byte; + s1, s2, t1: integer; + l1: TAsmLabel; begin if (op in [OP_MUL,OP_IMUL]) and (size in [OS_16,OS_S16]) and (a in [2,4,8]) then begin @@ -451,6 +456,88 @@ unit cgcpu; a:=a shr 1; end; end + else if (op in [OP_SHL, OP_SHR]) and (a > 0) then // a = 0 get eliminated later by tcg.optimize_op_const + begin + b := a div 8; // number of bytes to shift + + // copy from src to dst accounting for shift offset + for i := 0 to (tcgsize2size[size]-b-1) do + if op = OP_SHL then + a_load_reg_reg(list, OS_8, OS_8, + GetOffsetReg64(src, NR_NO, i), + GetOffsetReg64(dst, NR_NO, i+b)) + else + a_load_reg_reg(list, OS_8, OS_8, + GetOffsetReg64(src, NR_NO, i+b), + GetOffsetReg64(dst, NR_NO, i)); + + b2 := a mod 8; // remaining bit shifts + if b2 > 0 then + begin + // Cost of loop + s1 := 3 + tcgsize2size[size] - b; + t1 := b2*(tcgsize2size[size] - b + 3); + //Cost of loop unrolling, t2 = s2 + s2 := b2*(tcgsize2size[size] - b); + + if ((cs_opt_size in current_settings.optimizerswitches) and (s1 < s2)) or + (((s2 - s1) - t1/s2) > 0) then + begin + // Shift non-moved bytes in loop + current_asmdata.getjumplabel(l1); + countreg:=getintregister(list,OS_8); + a_load_const_reg(list,OS_8,b2,countreg); + cg.a_label(list,l1); + if op = OP_SHL then + list.concat(taicpu.op_reg(A_LSL,GetOffsetReg64(dst, NR_NO, b))) + else + list.concat(taicpu.op_reg(A_LSR,GetOffsetReg64(dst, NR_NO, tcgsize2size[size]-1-b))); + + if size in [OS_S16,OS_16,OS_S32,OS_32,OS_S64,OS_64] then + begin + for i:=2+b to tcgsize2size[size] do + if op = OP_SHL then + list.concat(taicpu.op_reg(A_ROL,GetOffsetReg64(dst,NR_NO,i-1))) + else + 
list.concat(taicpu.op_reg(A_ROR,GetOffsetReg64(dst, NR_NO,tcgsize2size[size]-i-b))); + end; + list.concat(taicpu.op_reg(A_DEC,countreg)); + a_jmp_flags(list,F_NE,l1); + // keep registers alive + list.concat(taicpu.op_reg_reg(A_MOV,countreg,countreg)); + end + else + begin + // Unroll shift loop over non-moved bytes + for j := 1 to b2 do + begin + if op = OP_SHL then + list.concat(taicpu.op_reg(A_LSL, + GetOffsetReg64(dst, NR_NO, b))) + else + list.concat(taicpu.op_reg(A_LSR, + GetOffsetReg64(dst, NR_NO, tcgsize2size[size]-b-1))); + + if not(size in [OS_8, OS_S8]) then + for i := 2 to tcgsize2size[size]-b do + if op = OP_SHL then + list.concat(taicpu.op_reg(A_ROL, + GetOffsetReg64(dst, NR_NO, b+i-1))) + else + list.concat(taicpu.op_reg(A_ROR, + GetOffsetReg64(dst, NR_NO, tcgsize2size[size]-b-i))); + end; + end; + end; + + // fill skipped destination registers with 0 + // Do last, then optimizer can optimize register moves + for i := 1 to b do + if op = OP_SHL then + emit_mov(list, GetOffsetReg64(dst, NR_NO, i-1), NR_R1) + else + emit_mov(list, GetOffsetReg64(dst, NR_NO, tcgsize2size[size]-i), NR_R1); + end else inherited a_op_const_reg_reg(list,op,size,a,src,dst); end;
_______________________________________________ fpc-devel maillist - fpc-devel@lists.freepascal.org https://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-devel