Re: [PATCH v4 16/16] tcg/loongarch64: Implement 128-bit load & store

2023-09-09 Thread Richard Henderson

On 9/7/23 19:21, Jiajie Chen wrote:

+static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg data_lo, TCGReg data_hi,
+   TCGReg addr_reg, MemOpIdx oi, bool is_ld)
+{
+TCGLabelQemuLdst *ldst;
+HostAddress h;
+
+ldst = prepare_host_addr(s, &h, addr_reg, oi, true);


Final argument here should be is_ld.

Since this is the only remaining error, I will fix while queuing.

Reviewed-by: Richard Henderson 

r~



[PATCH v4 16/16] tcg/loongarch64: Implement 128-bit load & store

2023-09-07 Thread Jiajie Chen
If LSX is available, use LSX instructions to implement 128-bit load &
store when MO_128 is required, otherwise use two 64-bit loads & stores.

Signed-off-by: Jiajie Chen 
---
 tcg/loongarch64/tcg-target-con-set.h |  2 +
 tcg/loongarch64/tcg-target.c.inc | 59 
 tcg/loongarch64/tcg-target.h |  2 +-
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h
index 914572d21b..77d62e38e7 100644
--- a/tcg/loongarch64/tcg-target-con-set.h
+++ b/tcg/loongarch64/tcg-target-con-set.h
@@ -18,6 +18,7 @@ C_O0_I1(r)
 C_O0_I2(rZ, r)
 C_O0_I2(rZ, rZ)
 C_O0_I2(w, r)
+C_O0_I3(r, r, r)
 C_O1_I1(r, r)
 C_O1_I1(w, r)
 C_O1_I1(w, w)
@@ -37,3 +38,4 @@ C_O1_I2(w, w, wM)
 C_O1_I2(w, w, wA)
 C_O1_I3(w, w, w, w)
 C_O1_I4(r, rZ, rJ, rZ, rZ)
+C_O2_I1(r, r, r)
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index 82901d678a..6e9f334fed 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -1081,6 +1081,48 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
 }
 }
 
+static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg data_lo, TCGReg data_hi,
+   TCGReg addr_reg, MemOpIdx oi, bool is_ld)
+{
+TCGLabelQemuLdst *ldst;
+HostAddress h;
+
+ldst = prepare_host_addr(s, &h, addr_reg, oi, true);
+
+if (h.aa.atom == MO_128) {
+/*
+ * Use VLDX/VSTX when 128-bit atomicity is required.
+ * If address is aligned to 16-bytes, the 128-bit load/store is atomic.
+ */
+if (is_ld) {
+tcg_out_opc_vldx(s, TCG_VEC_TMP0, h.base, h.index);
+tcg_out_opc_vpickve2gr_d(s, data_lo, TCG_VEC_TMP0, 0);
+tcg_out_opc_vpickve2gr_d(s, data_hi, TCG_VEC_TMP0, 1);
+} else {
+tcg_out_opc_vinsgr2vr_d(s, TCG_VEC_TMP0, data_lo, 0);
+tcg_out_opc_vinsgr2vr_d(s, TCG_VEC_TMP0, data_hi, 1);
+tcg_out_opc_vstx(s, TCG_VEC_TMP0, h.base, h.index);
+}
+} else {
+/* otherwise use a pair of LD/ST */
+tcg_out_opc_add_d(s, TCG_REG_TMP0, h.base, h.index);
+if (is_ld) {
+tcg_out_opc_ld_d(s, data_lo, TCG_REG_TMP0, 0);
+tcg_out_opc_ld_d(s, data_hi, TCG_REG_TMP0, 8);
+} else {
+tcg_out_opc_st_d(s, data_lo, TCG_REG_TMP0, 0);
+tcg_out_opc_st_d(s, data_hi, TCG_REG_TMP0, 8);
+}
+}
+
+if (ldst) {
+ldst->type = TCG_TYPE_I128;
+ldst->datalo_reg = data_lo;
+ldst->datahi_reg = data_hi;
+ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
+}
+}
+
 /*
  * Entry-points
  */
@@ -1145,6 +1187,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 TCGArg a0 = args[0];
 TCGArg a1 = args[1];
 TCGArg a2 = args[2];
+TCGArg a3 = args[3];
 int c2 = const_args[2];
 
 switch (opc) {
@@ -1507,6 +1550,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 case INDEX_op_qemu_ld_a64_i64:
 tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I64);
 break;
+case INDEX_op_qemu_ld_a32_i128:
+case INDEX_op_qemu_ld_a64_i128:
+tcg_out_qemu_ldst_i128(s, a0, a1, a2, a3, true);
+break;
 case INDEX_op_qemu_st_a32_i32:
 case INDEX_op_qemu_st_a64_i32:
 tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I32);
@@ -1515,6 +1562,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 case INDEX_op_qemu_st_a64_i64:
 tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I64);
 break;
+case INDEX_op_qemu_st_a32_i128:
+case INDEX_op_qemu_st_a64_i128:
+tcg_out_qemu_ldst_i128(s, a0, a1, a2, a3, false);
+break;
 
 case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
 case INDEX_op_mov_i64:
@@ -1996,6 +2047,14 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 case INDEX_op_qemu_st_a64_i64:
 return C_O0_I2(rZ, r);
 
+case INDEX_op_qemu_ld_a32_i128:
+case INDEX_op_qemu_ld_a64_i128:
+return C_O2_I1(r, r, r);
+
+case INDEX_op_qemu_st_a32_i128:
+case INDEX_op_qemu_st_a64_i128:
+return C_O0_I3(r, r, r);
+
 case INDEX_op_brcond_i32:
 case INDEX_op_brcond_i64:
 return C_O0_I2(rZ, rZ);
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index 67b0a95532..03017672f6 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -171,7 +171,7 @@ extern bool use_lsx_instructions;
 #define TCG_TARGET_HAS_muluh_i641
 #define TCG_TARGET_HAS_mulsh_i641
 
-#define TCG_TARGET_HAS_qemu_ldst_i128   0
+#define TCG_TARGET_HAS_qemu_ldst_i128   use_lsx_instructions
 
 #define TCG_TARGET_HAS_v64  0
 #define TCG_TARGET_HAS_v128 use_lsx_instructions
-- 
2.42.0