| Issue |
168570
|
| Summary |
[ARM] CostPerUse for high registers should only apply in instructions with 16-bit variants
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
john-brawn-arm
|
ARMRegisterInfo.td defines the integer registers like this:
```
// Integer registers
def R0 : ARMReg< 0, "r0">, DwarfRegNum<[0]>;
def R1 : ARMReg< 1, "r1">, DwarfRegNum<[1]>;
def R2 : ARMReg< 2, "r2">, DwarfRegNum<[2]>;
def R3 : ARMReg< 3, "r3">, DwarfRegNum<[3]>;
def R4 : ARMReg< 4, "r4">, DwarfRegNum<[4]>;
def R5 : ARMReg< 5, "r5">, DwarfRegNum<[5]>;
def R6 : ARMReg< 6, "r6">, DwarfRegNum<[6]>;
def R7 : ARMReg< 7, "r7">, DwarfRegNum<[7]>;
// These require 32-bit instructions.
let CostPerUse = [1] in {
def R8 : ARMReg< 8, "r8">, DwarfRegNum<[8]>;
def R9 : ARMReg< 9, "r9">, DwarfRegNum<[9]>;
def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
let RegAltNameIndices = [RegNamesRaw] in {
def SP : ARMReg<13, "sp", [], ["r13"]>, DwarfRegNum<[13]>;
def LR : ARMReg<14, "lr", [], ["r14"]>, DwarfRegNum<[14]>;
def PC : ARMReg<15, "pc", [], ["r15"]>, DwarfRegNum<[15]>;
}
}
```
The additional CostPerUse on high registers only makes sense for instructions that have 16-bit variants. The extra CostPerUse can lead to the register allocator inserting extra MOV instructions to/from high registers whenever doing so lets more instructions use low registers. An example is the following, reduced from https://github.com/ARM-software/CMSIS-DSP/blob/main/Source/FilteringFunctions/arm_fir_q7.c:
```
#include <arm_mve.h>
void arm_fir_q7_1_16_mve(signed char *pState,
const signed char *pCoeffs,
unsigned int numTaps,
const signed char * __restrict pSrc,
signed char * __restrict pDst,
unsigned int blockSize)
{
int8x16_t vecCoeff = vldrbq_s8(pCoeffs);
signed char *pStateCur = &(pState[(numTaps - 1u)]);
const signed char *pTempSrc = pSrc;
const signed char *pSamples = pState;
signed char *pOutput = pDst;
signed int blkCnt = blockSize >> 2;
while (blkCnt > 0) {
vstrbq_s32(pStateCur, vldrbq_s32(pTempSrc));
pStateCur += 4;
pTempSrc += 4;
for (int j = 0; j < 4; j++) {
int8x16_t vecIn0 = vld1q(pSamples + j);
signed int acc = vmladavaq(0, vecIn0, vecCoeff);
*pOutput++ = __builtin_arm_ssat((acc >> 7U), 8);
}
pSamples += 4;
blkCnt--;
}
signed int residual = blockSize & 3;
for (int i = 0; i < residual; i++)
*pStateCur++ = *pTempSrc++;
for (int j = 0; j < residual; j++) {
int8x16_t vecIn0 = vld1q(pSamples + j);
signed int acc = vmladavaq(0, vecIn0, vecCoeff);
*pOutput++ = __builtin_arm_ssat((acc >> 7U), 8);
}
pTempSrc = &pState[blockSize];
signed char *pTempDest = pState;
blkCnt = numTaps - 1;
do {
mve_pred16_t p = vctp8q(blkCnt);
vstrbq_p_s8(pTempDest, vldrbq_z_s8(pTempSrc, p), p);
pTempSrc += 16;
pTempDest += 16;
blkCnt -= 16;
}
while (blkCnt > 0);
}
```
When compiled with `clang --target=arm-none-eabi -mcpu=cortex-m55 -O3 -ffast-math -mfloat-abi=hard` the main loop looks like this:
```
.LBB0_2: @ =>This Inner Loop Header: Depth=1
vldrb.u32 q1, [r3], #4
vstrb.32 q1, [r5], #4
mov r9, r5
vldrb.u8 q1, [r4], #4
vmlav.s8 r2, q1, q0
vldrb.u8 q1, [r4, #-3]
vmlav.s8 r10, q1, q0
vldrb.u8 q1, [r4, #-2]
vmlav.s8 r8, q1, q0
vldrb.u8 q1, [r4, #-1]
vmlav.s8 r12, q1, q0
ssat r7, #8, r2, asr #7
ssat r5, #8, r10, asr #7
ssat r2, #8, r8, asr #7
strb r7, [r1], #4
ssat r7, #8, r12, asr #7
strb r5, [r1, #-3]
mov r5, r9
strb r2, [r1, #-2]
strb r7, [r1, #-1]
le lr, .LBB0_2
```
The register allocator has preferred to insert a mov from r5 to r9 and back rather than use r9 directly in the ssat and strb instructions. But ssat and strb in these forms don't have 16-bit variants, so 32-bit encodings are emitted no matter which registers are used; all the extra cost has achieved is to pointlessly add two extra mov instructions to the loop.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs