Re: [PATCH v2] LoongArch: Implement FCCmode reload and cstore4

2023-12-21 Thread Jiahao Xu
SPECCPU 2017 and SPECCPU 2006 were successfully built and tested. This 
patch gives a 1.3% improvement in SPECCPU 2017 fprate on 3A6000, and no 
performance regression was found. This is an effective optimization and 
looks good.


在 2023/12/15 下午4:57, Xi Ruoyao 写道:

We used a branch to load floating-point comparison results into GPR.
This is very slow when the branch is not predictable.

Implement movfcc so we can reload FCCmode into GPRs, FPRs, and MEM.
Then implement cstore4.

gcc/ChangeLog:

* config/loongarch/loongarch-tune.h
(loongarch_rtx_cost_data::movcf2gr): New field.
(loongarch_rtx_cost_data::movcf2gr_): New method.
(loongarch_rtx_cost_data::use_movcf2gr): New method.
* config/loongarch/loongarch-def.cc
(loongarch_rtx_cost_data::loongarch_rtx_cost_data): Set movcf2gr
to COSTS_N_INSNS (7) and movgr2cf to COSTS_N_INSNS (15), based
on timing on LA464.
(loongarch_cpu_rtx_cost_data): Set movcf2gr and movgr2cf to
COSTS_N_INSNS (1) for LA664.
(loongarch_rtx_cost_optimize_size): Set movcf2gr and movgr2cf to
COSTS_N_INSNS (1) + 1.
* config/loongarch/predicates.md (loongarch_fcmp_operator): New
predicate.
* config/loongarch/loongarch.md (movfcc): Change to
define_expand.
(movfcc_internal): New define_insn.
(fcc_to_): New define_insn.
(cstore4): New define_expand.
* config/loongarch/loongarch.cc
(loongarch_hard_regno_mode_ok_uncached): Allow FCCmode in GPRs
and FPRs.
(loongarch_secondary_reload): Reload FCCmode via FPR and/or GPR.
(loongarch_emit_float_compare): Call gen_reg_rtx instead of
loongarch_allocate_fcc.
(loongarch_allocate_fcc): Remove.
(loongarch_move_to_gpr_cost): Handle FCC_REGS -> GR_REGS.
(loongarch_move_from_gpr_cost): Handle GR_REGS -> FCC_REGS.
(loongarch_register_move_cost): Handle FCC_REGS -> FCC_REGS,
FCC_REGS -> FP_REGS, and FP_REGS -> FCC_REGS.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/movcf2gr.c: New test.
* gcc.target/loongarch/movcf2gr-via-fr.c: New test.
---

Supersedes
https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640497.html.

Bootstrapped and regtested on loongarch64-linux-gnu.  Ok for trunk?

  gcc/config/loongarch/loongarch-def.cc | 13 +++-
  gcc/config/loongarch/loongarch-tune.h | 15 +++-
  gcc/config/loongarch/loongarch.cc | 70 ---
  gcc/config/loongarch/loongarch.md | 69 --
  gcc/config/loongarch/predicates.md|  4 ++
  .../gcc.target/loongarch/movcf2gr-via-fr.c| 10 +++
  gcc/testsuite/gcc.target/loongarch/movcf2gr.c |  9 +++
  7 files changed, 157 insertions(+), 33 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/loongarch/movcf2gr-via-fr.c
  create mode 100644 gcc/testsuite/gcc.target/loongarch/movcf2gr.c

diff --git a/gcc/config/loongarch/loongarch-def.cc 
b/gcc/config/loongarch/loongarch-def.cc
index 4a8885e8343..843be78e46e 100644
--- a/gcc/config/loongarch/loongarch-def.cc
+++ b/gcc/config/loongarch/loongarch-def.cc
@@ -101,15 +101,21 @@ loongarch_rtx_cost_data::loongarch_rtx_cost_data ()
  int_mult_di (COSTS_N_INSNS (4)),
  int_div_si (COSTS_N_INSNS (5)),
  int_div_di (COSTS_N_INSNS (5)),
+movcf2gr (COSTS_N_INSNS (7)),
+movgr2cf (COSTS_N_INSNS (15)),
  branch_cost (6),
  memory_latency (4) {}
  
  /* The following properties cannot be looked up directly using "cpucfg".

   So it is necessary to provide a default value for "unknown native"
   tune targets (i.e. -mtune=native while PRID does not correspond to
- any known "-mtune" type).  Currently all numbers are default.  */
+ any known "-mtune" type).  */
  array_tune loongarch_cpu_rtx_cost_data =
-  array_tune ();
+  array_tune ()
+.set (CPU_LA664,
+ loongarch_rtx_cost_data ()
+   .movcf2gr_ (COSTS_N_INSNS (1))
+   .movgr2cf_ (COSTS_N_INSNS (1)));
  
  /* RTX costs to use when optimizing for size.

 We use a value slightly larger than COSTS_N_INSNS (1) for all of them
@@ -125,7 +131,8 @@ const loongarch_rtx_cost_data 
loongarch_rtx_cost_optimize_size =
  .int_mult_si_ (COST_COMPLEX_INSN)
  .int_mult_di_ (COST_COMPLEX_INSN)
  .int_div_si_ (COST_COMPLEX_INSN)
-.int_div_di_ (COST_COMPLEX_INSN);
+.int_div_di_ (COST_COMPLEX_INSN)
+.movcf2gr_ (COST_COMPLEX_INSN);
  
  array_tune loongarch_cpu_issue_rate = array_tune ()

.set (CPU_NATIVE, 4)
diff --git a/gcc/config/loongarch/loongarch-tune.h 
b/gcc/config/loongarch/loongarch-tune.h
index 4aa01c54c08..7a75c8dd9d9 100644
--- a/gcc/config/loongarch/loongarch-tune.h
+++ b/gcc/config/loongarch/loongarch-tune.h
@@ -35,6 +35,8 @@ struct loongarch_rtx_cost_data
unsigned short int_mult_di;
unsigned short int_div_si;
unsigned short int_div_di;
+  unsigned short movcf2gr;
+  unsigned short movgr2cf;
unsigned short 

[PATCH v2] LoongArch: Implement FCCmode reload and cstore4

2023-12-15 Thread Xi Ruoyao
We used a branch to load floating-point comparison results into GPR.
This is very slow when the branch is not predictable.

Implement movfcc so we can reload FCCmode into GPRs, FPRs, and MEM.
Then implement cstore4.

gcc/ChangeLog:

* config/loongarch/loongarch-tune.h
(loongarch_rtx_cost_data::movcf2gr): New field.
(loongarch_rtx_cost_data::movcf2gr_): New method.
(loongarch_rtx_cost_data::use_movcf2gr): New method.
* config/loongarch/loongarch-def.cc
(loongarch_rtx_cost_data::loongarch_rtx_cost_data): Set movcf2gr
to COSTS_N_INSNS (7) and movgr2cf to COSTS_N_INSNS (15), based
on timing on LA464.
(loongarch_cpu_rtx_cost_data): Set movcf2gr and movgr2cf to
COSTS_N_INSNS (1) for LA664.
(loongarch_rtx_cost_optimize_size): Set movcf2gr and movgr2cf to
COSTS_N_INSNS (1) + 1.
* config/loongarch/predicates.md (loongarch_fcmp_operator): New
predicate.
* config/loongarch/loongarch.md (movfcc): Change to
define_expand.
(movfcc_internal): New define_insn.
(fcc_to_): New define_insn.
(cstore4): New define_expand.
* config/loongarch/loongarch.cc
(loongarch_hard_regno_mode_ok_uncached): Allow FCCmode in GPRs
and FPRs.
(loongarch_secondary_reload): Reload FCCmode via FPR and/or GPR.
(loongarch_emit_float_compare): Call gen_reg_rtx instead of
loongarch_allocate_fcc.
(loongarch_allocate_fcc): Remove.
(loongarch_move_to_gpr_cost): Handle FCC_REGS -> GR_REGS.
(loongarch_move_from_gpr_cost): Handle GR_REGS -> FCC_REGS.
(loongarch_register_move_cost): Handle FCC_REGS -> FCC_REGS,
FCC_REGS -> FP_REGS, and FP_REGS -> FCC_REGS.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/movcf2gr.c: New test.
* gcc.target/loongarch/movcf2gr-via-fr.c: New test.
---

Supersedes
https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640497.html.

Bootstrapped and regtested on loongarch64-linux-gnu.  Ok for trunk?

 gcc/config/loongarch/loongarch-def.cc | 13 +++-
 gcc/config/loongarch/loongarch-tune.h | 15 +++-
 gcc/config/loongarch/loongarch.cc | 70 ---
 gcc/config/loongarch/loongarch.md | 69 --
 gcc/config/loongarch/predicates.md|  4 ++
 .../gcc.target/loongarch/movcf2gr-via-fr.c| 10 +++
 gcc/testsuite/gcc.target/loongarch/movcf2gr.c |  9 +++
 7 files changed, 157 insertions(+), 33 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/movcf2gr-via-fr.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/movcf2gr.c

diff --git a/gcc/config/loongarch/loongarch-def.cc 
b/gcc/config/loongarch/loongarch-def.cc
index 4a8885e8343..843be78e46e 100644
--- a/gcc/config/loongarch/loongarch-def.cc
+++ b/gcc/config/loongarch/loongarch-def.cc
@@ -101,15 +101,21 @@ loongarch_rtx_cost_data::loongarch_rtx_cost_data ()
 int_mult_di (COSTS_N_INSNS (4)),
 int_div_si (COSTS_N_INSNS (5)),
 int_div_di (COSTS_N_INSNS (5)),
+movcf2gr (COSTS_N_INSNS (7)),
+movgr2cf (COSTS_N_INSNS (15)),
 branch_cost (6),
 memory_latency (4) {}
 
 /* The following properties cannot be looked up directly using "cpucfg".
  So it is necessary to provide a default value for "unknown native"
  tune targets (i.e. -mtune=native while PRID does not correspond to
- any known "-mtune" type).  Currently all numbers are default.  */
+ any known "-mtune" type).  */
 array_tune loongarch_cpu_rtx_cost_data =
-  array_tune ();
+  array_tune ()
+.set (CPU_LA664,
+ loongarch_rtx_cost_data ()
+   .movcf2gr_ (COSTS_N_INSNS (1))
+   .movgr2cf_ (COSTS_N_INSNS (1)));
 
 /* RTX costs to use when optimizing for size.
We use a value slightly larger than COSTS_N_INSNS (1) for all of them
@@ -125,7 +131,8 @@ const loongarch_rtx_cost_data 
loongarch_rtx_cost_optimize_size =
 .int_mult_si_ (COST_COMPLEX_INSN)
 .int_mult_di_ (COST_COMPLEX_INSN)
 .int_div_si_ (COST_COMPLEX_INSN)
-.int_div_di_ (COST_COMPLEX_INSN);
+.int_div_di_ (COST_COMPLEX_INSN)
+.movcf2gr_ (COST_COMPLEX_INSN);
 
 array_tune loongarch_cpu_issue_rate = array_tune ()
   .set (CPU_NATIVE, 4)
diff --git a/gcc/config/loongarch/loongarch-tune.h 
b/gcc/config/loongarch/loongarch-tune.h
index 4aa01c54c08..7a75c8dd9d9 100644
--- a/gcc/config/loongarch/loongarch-tune.h
+++ b/gcc/config/loongarch/loongarch-tune.h
@@ -35,6 +35,8 @@ struct loongarch_rtx_cost_data
   unsigned short int_mult_di;
   unsigned short int_div_si;
   unsigned short int_div_di;
+  unsigned short movcf2gr;
+  unsigned short movgr2cf;
   unsigned short branch_cost;
   unsigned short memory_latency;
 
@@ -95,6 +97,18 @@ struct loongarch_rtx_cost_data
 return *this;
   }
 
+  loongarch_rtx_cost_data movcf2gr_ (unsigned short _movcf2gr)
+  {
+movcf2gr = _movcf2gr;
+return *this;
+  }
+
+  loongarch_rtx_cost_data movgr2cf_ (unsigned short