Hi,
This patch updates the ThunderX schedule model to be more accurate and
to model the SIMD and FP instructions that I missed in the previous
patch.
OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.
Thanks,
Andrew Pinski
ChangeLog:
* config/aarch64/thunderx.md (thunderx_shift): Add rbit and rev.
(thunderx_crc32): New reservation.
(thunderx_fmov): Remove fcsel, ffarithd and ffariths.
(thunderx_fabs): New reservation.
(thunderx_fcsel): New reservation.
(thunderx_fcmp): New reservation.
(thunderx_fsqrtd): Correct latency.
(thunderx_frint): Add f_cvt.
(thunderx_f_cvt): Remove f_cvt.
(thunderx_simd_fp_store): Add neon_store1_one_lane
and neon_store1_one_lane_q.
(thunderx_neon_ld1): New reservation.
(thunderx_neon_move): Add neon_dup,
neon_ins, neon_from_gp, neon_to_gp,
neon_abs, neon_neg,
neon_fp_neg_s, and neon_fp_abs_s.
(thunderx_neon_move_q): Add neon_dup_q,
neon_ins_q, neon_from_gp_q, neon_to_gp_q,
neon_abs_q, neon_neg_q,
neon_fp_neg_s_q, neon_fp_neg_d_q,
neon_fp_abs_s_q, and neon_fp_abs_d_q.
(thunderx_neon_add): Add neon_arith_acc, neon_rev, neon_fp_abd_s,
neon_fp_abd_d, and neon_fp_reduc_minmax_s.
(thunderx_neon_add_q): Add neon_fp_abd_s_q, neon_fp_abd_d_q,
neon_arith_acc_q, neon_rev_q,
neon_fp_reduc_minmax_s_q, and neon_fp_reduc_minmax_d_q.
(thunderx_neon_mult): New reservation.
(thunderx_neon_mult_q): New reservation.
(thunderx_crypto_aese): New reservation.
(thunderx_crypto_aesmc): New reservation.
(bypasses): Add bypass to thunderx_neon_mult_q.
(thunderx_tbl): New reservation.
(thunderx_tblq): New reservation.
Index: config/aarch64/thunderx.md
===
--- config/aarch64/thunderx.md (revision 224856)
+++ config/aarch64/thunderx.md (working copy)
@@ -39,7 +39,7 @@ (define_insn_reservation thunderx_add
(define_insn_reservation thunderx_shift 1
(and (eq_attr tune thunderx)
- (eq_attr type bfm,extend,shift_imm,shift_reg))
+ (eq_attr type bfm,extend,shift_imm,shift_reg,rbit,rev))
thunderx_pipe0 | thunderx_pipe1)
@@ -66,12 +66,18 @@ (define_insn_reservation thunderx_mul
(eq_attr type mul,muls,mla,mlas,clz,smull,umull,smlal,umlal))
thunderx_pipe1 + thunderx_mult)
-;; Multiply high instructions take an extra cycle and cause the muliply unit to
-;; be busy for an extra cycle.
+;; crcb, crch and crcw are 4 cycles and can only happen on pipe 1
-;(define_insn_reservation thunderx_mul_high 5
+(define_insn_reservation thunderx_crc32 4
+ (and (eq_attr tune thunderx)
+ (eq_attr type crc))
+ thunderx_pipe1 + thunderx_mult)
+
+;; crcx is 5 cycles and can only happen on pipe 1
+;(define_insn_reservation thunderx_crc64 5
; (and (eq_attr tune thunderx)
-; (eq_attr type smull,umull))
+; (eq_attr type crc)
+; (eq_attr mode DI))
; thunderx_pipe1 + thunderx_mult)
(define_insn_reservation thunderx_div32 22
@@ -97,6 +103,11 @@ (define_insn_reservation thunderx_store
(eq_attr type store2))
thunderx_pipe0 + thunderx_pipe1)
;; Prefetches are single issued
+;(define_insn_reservation thunderx_prefetch 1
+; (and (eq_attr tune thunderx)
+; (eq_attr type prefetch))
+; thunderx_pipe0 + thunderx_pipe1)
;; loads (and load pairs) from L1 take 3 cycles in pipe 0
(define_insn_reservation thunderx_load 3
@@ -121,10 +132,21 @@ (define_insn_reservation thunderx_fcons
(eq_attr type fconsts,fconstd))
thunderx_pipe1)
-;; Moves between fp are 2 cycles including min/max/select/abs/neg
+;; Moves between fp are 2 cycles including min/max
(define_insn_reservation thunderx_fmov 2
(and (eq_attr tune thunderx)
- (eq_attr type fmov,f_minmaxs,f_minmaxd,fcsel,ffarithd,ffariths))
+ (eq_attr type fmov,f_minmaxs,f_minmaxd))
+ thunderx_pipe1)
+
+;; ABS and NEG are 1 cycle
+(define_insn_reservation thunderx_fabs 1
+ (and (eq_attr tune thunderx)
+ (eq_attr type ffariths,ffarithd))
+ thunderx_pipe1)
+
+(define_insn_reservation thunderx_fcsel 3
+ (and (eq_attr tune thunderx)
+ (eq_attr type fcsel))
thunderx_pipe1)
(define_insn_reservation thunderx_fmovgpr 2
@@ -132,6 +154,11 @@ (define_insn_reservation thunderx_fmovg
(eq_attr type f_mrc, f_mcr))
thunderx_pipe1)
+(define_insn_reservation thunderx_fcmp 3
+ (and (eq_attr tune thunderx)
+ (eq_attr type fcmps,fcmpd))
+ thunderx_pipe1)
+
(define_insn_reservation thunderx_fmul 6
(and (eq_attr tune thunderx)
(eq_attr type fmacs,fmacd,fmuls,fmuld))
@@ -152,21 +179,21 @@ (define_insn_reservation thunderx_fsqrt
(eq_attr type fsqrts))
thunderx_pipe1 + thunderx_divide, thunderx_divide*13)
-(define_insn_reservation thunderx_fsqrtd 28
+(define_insn_reservation thunderx_fsqrtd 31
(and (eq_attr tune thunderx)
(eq_attr type fsqrtd))
- thunderx_pipe1 + thunderx_divide, thunderx_divide*31)
+ thunderx_pipe1 + thunderx_divide, thunderx_divide*27)
;; The rounding conversion inside fp is 4 cycles
(define_insn_reservation thunderx_frint 4
(and (eq_attr tune