On 09/22/2016 04:53 PM, Joseph Myers wrote:
> MAX_OP_PER_INSTR is currently 266, reported in commit
> 14dcdac82f398cbac874c8579b9583fab31c67bf to be the worst case for the
> ARM A64 decoder.
>
> Whether or not it was in fact the worst case at that time in 2014, I'm
> observing the instruction 0x4c006020 (st1 {v0.16b-v2.16b}, [x1])
> generate 386 ops from disas_ldst_multiple_struct with current sources,

For the record, I can reproduce your results on a 32-bit host with v0-v3; I assume the v2 here is a typo.

While increasing the max per insn is indeed one way to approach this, the aarch64 translator is being remarkably inefficient in this case. With the following patch, I see a reduction from 387 ops to 261 ops on a 32-bit host; on a 64-bit host, the reduction is from 258 ops to 195 ops.

I should also note that the implementation of this insn could be even simpler. I see it as performing 8 64-bit, little-endian, unaligned stores, so we should be able to implement it for a 64-bit host in about 25 ops, which implies that the current code is nearly 8 times too large.
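Concretely, here is an (untested) sketch of the inner loop for the st1 case, assuming a little-endian guest, byte elements, and selem == 1 (the interleaved st2/st3/st4 forms would need a different ordering); it reuses the existing read_vec_element helper and the function's rpt/rt/is_q/tcg_addr locals:

/* Untested sketch: emit st1 of whole registers as 64-bit stores.
   Assumes a little-endian guest and element size 1 (the .16b case),
   where the byte order within each 64-bit chunk already matches
   memory order.  */
int r, e;
TCGv_i64 tcg_tmp = tcg_temp_new_i64();

for (r = 0; r < rpt; r++) {
    int tt = (rt + r) % 32;
    for (e = 0; e < (is_q ? 2 : 1); e++) {
        /* Fetch 64-bit lane e of register tt, then store it.  */
        read_vec_element(s, tcg_tmp, tt, e, MO_64);
        tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), MO_LEQ);
        tcg_gen_addi_i64(tcg_addr, tcg_addr, 8);
    }
}
tcg_temp_free_i64(tcg_tmp);

Each iteration is roughly three ops on a 64-bit host (an env load, a guest store, an add), so eight iterations plus the address setup lands in the neighborhood of the 25 ops mentioned above.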

The same should be true for other combinations of sizes for ldst. I recognize that it gets more complicated for a big-endian guest and for element sizes larger than 1, but for element sizes larger than 1 we automatically have at most half the number of ops seen here.
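(Concretely: elements = (is_q ? 128 : 64) / (8 << size), so each doubling of the element size halves the element count; size == 1, for example, gives 8 halfword elements per 128-bit register instead of 16 byte elements, and the per-element loop emits correspondingly fewer ops.)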


r~
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index ddf52f5..e44bf96 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -2536,7 +2536,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
     bool is_store = !extract32(insn, 22, 1);
     bool is_postidx = extract32(insn, 23, 1);
     bool is_q = extract32(insn, 30, 1);
-    TCGv_i64 tcg_addr, tcg_rn;
+    TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
 
     int ebytes = 1 << size;
     int elements = (is_q ? 128 : 64) / (8 << size);
@@ -2601,6 +2601,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
     tcg_rn = cpu_reg_sp(s, rn);
     tcg_addr = tcg_temp_new_i64();
     tcg_gen_mov_i64(tcg_addr, tcg_rn);
+    tcg_ebytes = tcg_const_i64(ebytes);
 
     for (r = 0; r < rpt; r++) {
         int e;
@@ -2624,7 +2625,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
                         clear_vec_high(s, tt);
                     }
                 }
-                tcg_gen_addi_i64(tcg_addr, tcg_addr, ebytes);
+                tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
                 tt = (tt + 1) % 32;
             }
         }
@@ -2638,6 +2639,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
             tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm));
         }
     }
+    tcg_temp_free_i64(tcg_ebytes);
     tcg_temp_free_i64(tcg_addr);
 }
 
@@ -2680,7 +2682,7 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
     bool replicate = false;
     int index = is_q << 3 | S << 2 | size;
     int ebytes, xs;
-    TCGv_i64 tcg_addr, tcg_rn;
+    TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
 
     switch (scale) {
     case 3:
@@ -2733,6 +2735,7 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
     tcg_rn = cpu_reg_sp(s, rn);
     tcg_addr = tcg_temp_new_i64();
     tcg_gen_mov_i64(tcg_addr, tcg_rn);
+    tcg_ebytes = tcg_const_i64(ebytes);
 
     for (xs = 0; xs < selem; xs++) {
         if (replicate) {
@@ -2776,7 +2779,7 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
                 do_vec_st(s, rt, index, tcg_addr, s->be_data + scale);
             }
         }
-        tcg_gen_addi_i64(tcg_addr, tcg_addr, ebytes);
+        tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
         rt = (rt + 1) % 32;
     }
 
@@ -2788,6 +2791,7 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
             tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm));
         }
     }
+    tcg_temp_free_i64(tcg_ebytes);
     tcg_temp_free_i64(tcg_addr);
 }
 
