David Gibson <da...@gibson.dropbear.id.au> writes: > [ Unknown signature status ] > On Mon, Sep 19, 2016 at 04:19:34PM +1000, David Gibson wrote: >> On Fri, Sep 16, 2016 at 04:21:48PM +0530, Nikunj A Dadhania wrote: >> > diff --git a/target-ppc/translate/vsx-impl.inc.c >> > b/target-ppc/translate/vsx-impl.inc.c >> > index eee6052..df278df 100644 >> > --- a/target-ppc/translate/vsx-impl.inc.c >> > +++ b/target-ppc/translate/vsx-impl.inc.c >> > @@ -75,7 +75,6 @@ static void gen_lxvdsx(DisasContext *ctx) >> > static void gen_lxvw4x(DisasContext *ctx) >> > { >> > TCGv EA; >> > - TCGv_i64 tmp; >> > TCGv_i64 xth = cpu_vsrh(xT(ctx->opcode)); >> > TCGv_i64 xtl = cpu_vsrl(xT(ctx->opcode)); >> > if (unlikely(!ctx->vsx_enabled)) { >> > @@ -84,22 +83,14 @@ static void gen_lxvw4x(DisasContext *ctx) >> > } >> > gen_set_access_type(ctx, ACCESS_INT); >> > EA = tcg_temp_new(); >> > - tmp = tcg_temp_new_i64(); >> > >> > gen_addr_reg_index(ctx, EA); >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); >> > - tcg_gen_addi_tl(EA, EA, 4); >> > - gen_qemu_ld32u_i64(ctx, xth, EA); >> > - tcg_gen_deposit_i64(xth, xth, tmp, 32, 32); >> > - >> > - tcg_gen_addi_tl(EA, EA, 4); >> > - gen_qemu_ld32u_i64(ctx, tmp, EA); >> > - tcg_gen_addi_tl(EA, EA, 4); >> > - gen_qemu_ld32u_i64(ctx, xtl, EA); >> > - tcg_gen_deposit_i64(xtl, xtl, tmp, 32, 32); >> > - >> > + tcg_gen_qemu_ld_i64(xth, EA, ctx->mem_idx, MO_LEQ); >> > + gen_helper_deposit32x2(xth, xth); >> > + tcg_gen_addi_tl(EA, EA, 8); >> > + tcg_gen_qemu_ld_i64(xtl, EA, ctx->mem_idx, MO_LEQ); >> > + gen_helper_deposit32x2(xtl, xtl); > > ..and I think this is wrong for BE mode. The deposit32x2 will get the > words in the right order, but the bytes within each word will be wrong > because of the LE mode load on a BE setup.
Since lxvw4x/stxvw4x is available on POWER8, I tried running my test code on BE and LE Fedora24 VMs. The TCG results match the POWER8 hardware. The order within the word is not changed. A snippet of the test code is at the end of this email. I can share the full code if needed (maybe I will do it in kvm-unit-test). Fedora24VM BE: [fedora@cloudimg ~]$ uname -a Linux cloudimg.localdomain 4.5.5-300.fc24.ppc64 #1 SMP Tue May 24 12:24:54 UTC 2016 ppc64 ppc64 ppc64 GNU/Linux [fedora@cloudimg ~]$ ./lxv_x VRT32 = 00010203 20212223 30313233 40414243 [fedora@cloudimg ~]$ ./stxv_x E0E1E2E3 E4E5E6E7 F0F1F2F3 F4F5F6F7 TCG Result BE: ============== $ ./ppc64-linux-user/qemu-ppc64 -cpu POWER9 lxv_x VRT32 = 00010203 20212223 30313233 40414243 $ ./ppc64-linux-user/qemu-ppc64 -cpu POWER9 stxv_x E0E1E2E3 E4E5E6E7 F0F1F2F3 F4F5F6F7 Fedora24VM LE: ============== [fedora@cloudimg ~]$ uname -a Linux cloudimg.localdomain 4.5.5-300.fc24.ppc64le #1 SMP Tue May 24 12:23:26 UTC 2016 ppc64le ppc64le ppc64le GNU/Linux [fedora@cloudimg ~]$ ./lxv_x VRT32 = 40414243 30313233 20212223 00010203 [fedora@cloudimg ~]$ ./stxv_x F4F5F6F7 F0F1F2F3 E4E5E6E7 E0E1E2E3 TCG Result LE: ============== $ ./ppc64le-linux-user/qemu-ppc64le -cpu POWER9 lxv_x VRT32 = 40414243 30313233 20212223 00010203 $ ./ppc64le-linux-user/qemu-ppc64le -cpu POWER9 stxv_x F4F5F6F7 F0F1F2F3 E4E5E6E7 E0E1E2E3 Regards, Nikunj vsx.h: ====== #define U32_SIZE (sizeof(__vector uint32_t) / sizeof(uint32_t)) typedef union { __vector uint32_t v; uint32_t a[U32_SIZE]; } vuint32_t; static void vec_put_u32(__vector uint32_t v) { int i; vuint32_t u; for (u.v = v, i = 0; i < U32_SIZE; ++i) { printf("%08x ", u.a[i]); } printf("\n"); } static void print4x4(uint32_t *p) { int i; if (!p) return; for(i = 0; i < 4; i++) printf(" %08X ", p[i]); printf("\n"); } lxv_x.c: ======== uint32_t rb32[4] = {0x00010203, 0x20212223, 0x30313233, 0x40414243}; vuint32_t vrt32; asm("lxvw4x %x0, 0, %1 \n\t" \ : "=ws"(vrt32) : "r"(&rb32)); printf("VRT32 = "); vec_put_u32(vrt32); stxv_x.c: 
========= vuint32_t vrt32; vrt32.a[0] = 0xE0E1E2E3; vrt32.a[1] = 0xE4E5E6E7; vrt32.a[2] = 0xF0F1F2F3; vrt32.a[3] = 0xF4F5F6F7; asm("stxvw4x %x0, 0, %1 \n\t" \ : : "ws"(vrt32.v), "r"(&rb32)); print4x4(rb32);