On 09/22/2016 04:53 PM, Joseph Myers wrote:
> MAX_OP_PER_INSTR is currently 266, reported in commit
> 14dcdac82f398cbac874c8579b9583fab31c67bf to be the worst case for the
> ARM A64 decoder.
>
> Whether or not it was in fact the worst case at that time in 2014, I'm
> observing the instruction 0x4c006020 (st1 {v0.16b-v2.16b}, [x1])
> generate 386 ops from disas_ldst_multiple_struct with current sources,

For the record, I can reproduce your results on a 32-bit host with v0-v3; I assume the v2 here is a typo.

While increasing the max per insn is indeed one way to approach this, the aarch64 translator is being remarkably inefficient in this case. With the following patch, I see a reduction from 387 ops to 261 ops on a 32-bit host; on a 64-bit host, the reduction is from 258 ops to 195 ops.

I should also note that the implementation of this insn could be even simpler. I see it as performing 8 64-bit, little-endian, unaligned stores, so we should be able to implement it for a 64-bit host in about 25 ops, which implies that the current code is nearly 8 times too large.
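Concretely, here is an (untested) sketch of the inner loop for the st1 case, assuming a little-endian guest, byte elements, and selem == 1 (the interleaved st2/st3/st4 forms would need a different ordering); it reuses the existing read_vec_element helper and the function's rpt/rt/is_q/tcg_addr locals:

/* Untested sketch: emit st1 of whole registers as 64-bit stores.
   Assumes a little-endian guest and element size 1 (the .16b case),
   where the byte order within each 64-bit chunk already matches
   memory order.  */
int r, e;
TCGv_i64 tcg_tmp = tcg_temp_new_i64();

for (r = 0; r < rpt; r++) {
    int tt = (rt + r) % 32;
    for (e = 0; e < (is_q ? 2 : 1); e++) {
        /* Fetch 64-bit lane e of register tt, then store it.  */
        read_vec_element(s, tcg_tmp, tt, e, MO_64);
        tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), MO_LEQ);
        tcg_gen_addi_i64(tcg_addr, tcg_addr, 8);
    }
}
tcg_temp_free_i64(tcg_tmp);

Each iteration is roughly three ops on a 64-bit host (an env load, a guest store, an add), so eight iterations plus the address setup lands in the neighborhood of the 25 ops mentioned above.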

The same should be true for other combinations of sizes for ldst. I recognize that it gets more complicated for a big-endian guest and for element sizes larger than 1, but for element sizes larger than 1 we automatically have at most half the number of ops seen here.
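(Concretely: elements = (is_q ? 128 : 64) / (8 << size), so each doubling of the element size halves the element count; size == 1, for example, gives 8 halfword elements per 128-bit register instead of 16 byte elements, and the per-element loop emits correspondingly fewer ops.)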


r~
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index ddf52f5..e44bf96 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -2536,7 +2536,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
     bool is_store = !extract32(insn, 22, 1);
     bool is_postidx = extract32(insn, 23, 1);
     bool is_q = extract32(insn, 30, 1);
-    TCGv_i64 tcg_addr, tcg_rn;
+    TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
 
     int ebytes = 1 << size;
     int elements = (is_q ? 128 : 64) / (8 << size);
@@ -2601,6 +2601,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
     tcg_rn = cpu_reg_sp(s, rn);
     tcg_addr = tcg_temp_new_i64();
     tcg_gen_mov_i64(tcg_addr, tcg_rn);
+    tcg_ebytes = tcg_const_i64(ebytes);
 
     for (r = 0; r < rpt; r++) {
         int e;
@@ -2624,7 +2625,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
                         clear_vec_high(s, tt);
                     }
                 }
-                tcg_gen_addi_i64(tcg_addr, tcg_addr, ebytes);
+                tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
                 tt = (tt + 1) % 32;
             }
         }
@@ -2638,6 +2639,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
             tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm));
         }
     }
+    tcg_temp_free_i64(tcg_ebytes);
     tcg_temp_free_i64(tcg_addr);
 }
 
@@ -2680,7 +2682,7 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
     bool replicate = false;
     int index = is_q << 3 | S << 2 | size;
     int ebytes, xs;
-    TCGv_i64 tcg_addr, tcg_rn;
+    TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
 
     switch (scale) {
     case 3:
@@ -2733,6 +2735,7 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
     tcg_rn = cpu_reg_sp(s, rn);
     tcg_addr = tcg_temp_new_i64();
     tcg_gen_mov_i64(tcg_addr, tcg_rn);
+    tcg_ebytes = tcg_const_i64(ebytes);
 
     for (xs = 0; xs < selem; xs++) {
         if (replicate) {
@@ -2776,7 +2779,7 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
                 do_vec_st(s, rt, index, tcg_addr, s->be_data + scale);
             }
         }
-        tcg_gen_addi_i64(tcg_addr, tcg_addr, ebytes);
+        tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
         rt = (rt + 1) % 32;
     }
 
@@ -2788,6 +2791,7 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
             tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm));
         }
     }
+    tcg_temp_free_i64(tcg_ebytes);
     tcg_temp_free_i64(tcg_addr);
 }
 
