On Mon, 2015-08-17 at 17:34 +1000, Benjamin Herrenschmidt wrote:
> Currently, we get to the slow path for any unaligned access in the
> backend, because we effectively preserve the bottom address bits
> below the alignment requirement when comparing with the TLB entry,
> so any non-0 bit there will cause the compare to fail.

Forget about this one, it was already picked up by Richard, I forgot
about it when I did git send-email.

The other 4 however are candidates for review/merge.

Cheers.
Ben.

> For the same number of instructions, we can instead add the access
> size - 1 to the address and stick to clearing all the bottom bits.
>
> That means that normal unaligned accesses will not fall back (the HW
> will handle them fine). Only when crossing a page boundary will we
> end up having a mismatch, because we'll end up pointing to the next
> page, which cannot possibly be in that same TLB entry.
>
> Signed-off-by: Benjamin Herrenschmidt <b...@kernel.crashing.org>
> ---
>  tcg/ppc/tcg-target.c | 41 +++++++++++++++++++++++++++++++----------
>  1 file changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
> index 2b6eafa..ce8d546 100644
> --- a/tcg/ppc/tcg-target.c
> +++ b/tcg/ppc/tcg-target.c
> @@ -1361,7 +1361,7 @@ static void * const qemu_st_helpers[16] = {
>     in CR7, loads the addend of the TLB into R3, and returns the register
>     containing the guest address (zero-extended into R4).  Clobbers R0 and R2. */
>
> -static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
> +static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp opc,
>                                 TCGReg addrlo, TCGReg addrhi,
>                                 int mem_index, bool is_read)
>  {
> @@ -1371,6 +1371,7 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
>             : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
>      int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
>      TCGReg base = TCG_AREG0;
> +    TCGMemOp s_bits = opc & MO_SIZE;
>
>      /* Extract the page index, shifted into place for tlb index.  */
>      if (TCG_TARGET_REG_BITS == 64) {
> @@ -1422,17 +1423,37 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp s_bits,
>         to minimize any load use delay.  */
>      tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_REG_R3, add_off);
>
> -    /* Clear the non-page, non-alignment bits from the address.  */
> +    /* Clear the non-page, non-alignment bits from the address */
>      if (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32) {
> +        /* We don't support unaligned accesses on 32-bits, preserve
> +         * the bottom bits and thus trigger a comparison failure on
> +         * unaligned accesses
> +         */
>          tcg_out_rlw(s, RLWINM, TCG_REG_R0, addrlo, 0,
>                      (32 - s_bits) & 31, 31 - TARGET_PAGE_BITS);
> -    } else if (!s_bits) {
> -        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo,
> -                    0, 63 - TARGET_PAGE_BITS);
> +    } else if (s_bits) {
> +        /* > byte access, we need to handle alignment */
> +        if ((opc & MO_AMASK) == MO_ALIGN) {
> +            /* Alignment required by the front-end, same as 32-bits */
> +            tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
> +                        64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - s_bits);
> +            tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
> +        } else {
> +            /* We support unaligned accesses, we need to make sure we fail
> +             * if we cross a page boundary. The trick is to add the
> +             * access_size-1 to the address before masking the low bits.
> +             * That will make the address overflow to the next page if we
> +             * cross a page boundary which will then force a mismatch of
> +             * the TLB compare since the next page cannot possibly be in
> +             * the same TLB index.
> +             */
> +            tcg_out32(s, ADDI | TAI(TCG_REG_R0, addrlo, (1 << s_bits) - 1));
> +            tcg_out_rld(s, RLDICR, TCG_REG_R0, TCG_REG_R0,
> +                        0, 63 - TARGET_PAGE_BITS);
> +        }
>      } else {
> -        tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
> -                    64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - s_bits);
> -        tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
> +        /* Byte access, just chop off the bits below the page index */
> +        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo, 0, 63 - TARGET_PAGE_BITS);
>      }
>
>      if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
> @@ -1592,7 +1613,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
>
>  #ifdef CONFIG_SOFTMMU
>      mem_index = get_mmuidx(oi);
> -    addrlo = tcg_out_tlb_read(s, s_bits, addrlo, addrhi, mem_index, true);
> +    addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, true);
>
>      /* Load a pointer into the current opcode w/conditional branch-link. */
>      label_ptr = s->code_ptr;
> @@ -1667,7 +1688,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
>
>  #ifdef CONFIG_SOFTMMU
>      mem_index = get_mmuidx(oi);
> -    addrlo = tcg_out_tlb_read(s, s_bits, addrlo, addrhi, mem_index, false);
> +    addrlo = tcg_out_tlb_read(s, opc, addrlo, addrhi, mem_index, false);
>
>      /* Load a pointer into the current opcode w/conditional branch-link. */
>      label_ptr = s->code_ptr;
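
For anyone who wants to see the effect of the two masking strategies in
isolation, here is a minimal standalone C sketch. It is not QEMU code: the
page size, the tlb_cmp_old/tlb_cmp_new helper names and the sample addresses
are made up purely for illustration. The "old" function keeps the alignment
bits, so any unaligned address misses against the page-aligned TLB tag; the
"new" one adds size-1 before masking, so only a page-crossing access misses.

#include <stdint.h>
#include <stdio.h>

#define PAGE_BITS 12                        /* assume 4 KiB target pages */
#define PAGE_MASK (~(((uint64_t)1 << PAGE_BITS) - 1))

/* Old scheme (sketch): clear the bits between the alignment and the page
 * size, keeping the low (size - 1) bits.  Any unaligned address then
 * differs from the page-aligned TLB tag and takes the slow path. */
static uint64_t tlb_cmp_old(uint64_t addr, unsigned size)
{
    return addr & (PAGE_MASK | (size - 1));
}

/* New scheme (sketch): add (size - 1) first, then clear everything below
 * the page.  An unaligned access that stays inside the page still compares
 * equal; one that crosses a page boundary lands in the next page and
 * forces a mismatch. */
static uint64_t tlb_cmp_new(uint64_t addr, unsigned size)
{
    return (addr + size - 1) & PAGE_MASK;
}

int main(void)
{
    uint64_t tag = 0x1000;                  /* page-aligned TLB tag */

    /* 4-byte access at 0x1003: unaligned but entirely inside the page */
    printf("old, in-page:  %s\n", tlb_cmp_old(0x1003, 4) == tag ? "hit" : "miss");
    printf("new, in-page:  %s\n", tlb_cmp_new(0x1003, 4) == tag ? "hit" : "miss");

    /* 4-byte access at 0x1ffe: crosses into the next page */
    printf("new, crossing: %s\n", tlb_cmp_new(0x1ffe, 4) == tag ? "hit" : "miss");
    return 0;
}

With these numbers the old scheme misses even for the in-page unaligned
access, while the new one hits there and only misses on the page-crossing
access, which is the behaviour the commit message describes.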