On Tue, Oct 30, 2012 at 1:18 AM, <y...@ohm.aurel32.net> wrote: > From: Aurelien Jarno <aurel...@aurel32.net> > > The TCG arm backend considers likely that the offset to the TLB > entries does not exceed 12 bits for mem_index = 0. In practice this is > not true for at least the MIPS target. > > The current patch fixes that by loading the bits 23-12 with a separate > instruction, and using loads with address writeback, independently of > the value of mem_idx. In total this allow a 24-bit offset, which is a > lot more than needed. > > Cc: Andrzej Zaborowski <balr...@gmail.com> > Cc: Peter Maydell <peter.mayd...@linaro.org> > Cc: qemu-sta...@nongnu.org > Signed-off-by: Aurelien Jarno <aurel...@aurel32.net> > --- > tcg/arm/tcg-target.c | 77 > +++++++++++++++++++++++++++----------------------- > 1 file changed, 41 insertions(+), 36 deletions(-) > > diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c > index e790bf0..03b1576 100644 > --- a/tcg/arm/tcg-target.c > +++ b/tcg/arm/tcg-target.c > @@ -639,6 +639,22 @@ static inline void tcg_out_ld32_12(TCGContext *s, int > cond, > (rn << 16) | (rd << 12) | ((-im) & 0xfff)); > } > > +/* Offset pre-increment with base writeback. */ > +static inline void tcg_out_ld32_12wb(TCGContext *s, int cond, > + int rd, int rn, tcg_target_long im) > +{ > + /* ldr with writeback and both register equals is UNPREDICTABLE */ > + assert(rd != rn); > + > + if (im >= 0) { > + tcg_out32(s, (cond << 28) | 0x05b00000 | > + (rn << 16) | (rd << 12) | (im & 0xfff)); > + } else { > + tcg_out32(s, (cond << 28) | 0x05300000 | > + (rn << 16) | (rd << 12) | ((-im) & 0xfff)); > + } > +} > + > static inline void tcg_out_st32_12(TCGContext *s, int cond, > int rd, int rn, tcg_target_long im) > { > @@ -1071,7 +1087,7 @@ static inline void tcg_out_qemu_ld(TCGContext *s, const > TCGArg *args, int opc) > { > int addr_reg, data_reg, data_reg2, bswap; > #ifdef CONFIG_SOFTMMU > - int mem_index, s_bits; > + int mem_index, s_bits, tlb_offset; > TCGReg argreg; > # if TARGET_LONG_BITS == 64 > int addr_reg2; > @@ -1111,19 +1127,15 @@ static inline void tcg_out_qemu_ld(TCGContext *s, > const TCGArg *args, int opc) > TCG_REG_R0, TCG_REG_R8, CPU_TLB_SIZE - 1); > tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_AREG0, > TCG_REG_R0, SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS)); > - /* In the > - * ldr r1 [r0, #(offsetof(CPUArchState, > tlb_table[mem_index][0].addr_read))] > - * below, the offset is likely to exceed 12 bits if mem_index != 0 and > - * not exceed otherwise, so use an > - * add r0, r0, #(mem_index * sizeof *CPUArchState.tlb_table) > - * before. > - */ > - if (mem_index) > + /* We assume that the offset is contained within 24 bits. */ > + tlb_offset = offsetof(CPUArchState, tlb_table[mem_index][0].addr_read); > + assert(tlb_offset & ~0xffffff == 0); > + if (tlb_offset > 0xfff) { > tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_REG_R0, > - (mem_index << (TLB_SHIFT & 1)) | > - ((16 - (TLB_SHIFT >> 1)) << 8)); > - tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R0, > - offsetof(CPUArchState, tlb_table[0][0].addr_read)); > + 0xa00 | (tlb_offset >> 12));
Isn't it 20 bits rather than 24 bits since the immediate is 8-bit right-rotated by 20? Laurent > + tlb_offset &= 0xfff; > + } > + tcg_out_ld32_12wb(s, COND_AL, TCG_REG_R1, TCG_REG_R0, tlb_offset); > tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R1, > TCG_REG_R8, SHIFT_IMM_LSL(TARGET_PAGE_BITS)); > /* Check alignment. */ > @@ -1131,15 +1143,14 @@ static inline void tcg_out_qemu_ld(TCGContext *s, > const TCGArg *args, int opc) > tcg_out_dat_imm(s, COND_EQ, ARITH_TST, > 0, addr_reg, (1 << s_bits) - 1); > # if TARGET_LONG_BITS == 64 > - /* XXX: possibly we could use a block data load or writeback in > - * the first access. */ > - tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, > - offsetof(CPUArchState, tlb_table[0][0].addr_read) + 4); > + /* XXX: possibly we could use a block data load in the first access. */ > + tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, 4); > tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, > TCG_REG_R1, addr_reg2, SHIFT_IMM_LSL(0)); > # endif > tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, > - offsetof(CPUArchState, tlb_table[0][0].addend)); > + offsetof(CPUTLBEntry, addend) > + - offsetof(CPUTLBEntry, addr_read)); > > switch (opc) { > case 0: > @@ -1288,7 +1299,7 @@ static inline void tcg_out_qemu_st(TCGContext *s, const > TCGArg *args, int opc) > { > int addr_reg, data_reg, data_reg2, bswap; > #ifdef CONFIG_SOFTMMU > - int mem_index, s_bits; > + int mem_index, s_bits, tlb_offset; > TCGReg argreg; > # if TARGET_LONG_BITS == 64 > int addr_reg2; > @@ -1325,19 +1336,14 @@ static inline void tcg_out_qemu_st(TCGContext *s, > const TCGArg *args, int opc) > TCG_REG_R0, TCG_REG_R8, CPU_TLB_SIZE - 1); > tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R0, > TCG_AREG0, TCG_REG_R0, > SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS)); > - /* In the > - * ldr r1 [r0, #(offsetof(CPUArchState, > tlb_table[mem_index][0].addr_write))] > - * below, the offset is likely to exceed 12 bits if mem_index != 0 and > - * not exceed otherwise, so use an > - * add r0, r0, #(mem_index * sizeof *CPUArchState.tlb_table) > - * before. > - */ > - if (mem_index) > + /* We assume that the offset is contained within 24 bits. */ > + tlb_offset = offsetof(CPUArchState, tlb_table[mem_index][0].addr_write); > + if (tlb_offset > 0xfff) { > tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R0, TCG_REG_R0, > - (mem_index << (TLB_SHIFT & 1)) | > - ((16 - (TLB_SHIFT >> 1)) << 8)); > - tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R0, > - offsetof(CPUArchState, tlb_table[0][0].addr_write)); > + 0xa00 | (tlb_offset >> 12)); > + tlb_offset &= 0xfff; > + } > + tcg_out_ld32_12wb(s, COND_AL, TCG_REG_R1, TCG_REG_R0, tlb_offset); > tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R1, > TCG_REG_R8, SHIFT_IMM_LSL(TARGET_PAGE_BITS)); > /* Check alignment. */ > @@ -1345,15 +1351,14 @@ static inline void tcg_out_qemu_st(TCGContext *s, > const TCGArg *args, int opc) > tcg_out_dat_imm(s, COND_EQ, ARITH_TST, > 0, addr_reg, (1 << s_bits) - 1); > # if TARGET_LONG_BITS == 64 > - /* XXX: possibly we could use a block data load or writeback in > - * the first access. */ > - tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, > - offsetof(CPUArchState, tlb_table[0][0].addr_write) + 4); > + /* XXX: possibly we could use a block data load in the first access. */ > + tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, 4); > tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, > TCG_REG_R1, addr_reg2, SHIFT_IMM_LSL(0)); > # endif > tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R0, > - offsetof(CPUArchState, tlb_table[0][0].addend)); > + offsetof(CPUTLBEntry, addend) > + - offsetof(CPUTLBEntry, addr_write)); > > switch (opc) { > case 0: > -- > 1.7.10.4 > >