[PATCH] [APX ZU] Support APX zero-upper

2024-06-06 Thread Kong, Lingling
Enable ZU for IMUL (opcodes 0x69 and 0x6B) and SETcc.

gcc/ChangeLog:

* config/i386/i386-opts.h (enum apx_features): Add apx_zu.
* config/i386/i386.h (TARGET_APX_ZU): Define.
* config/i386/i386.md (*imulhi<mode>zu): New define_insn.
(*setcc_<mode>_zu): Ditto.
* config/i386/i386.opt: Add enum value for zu.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-zu-1.c: New test.
* gcc.target/i386/apx-zu-2.c: Ditto.

Bootstrapped & regtested on x86_64-pc-linux-gnu with binutils 2.42 branch.
OK for trunk?

---
 gcc/config/i386/i386-opts.h              |  3 +-
 gcc/config/i386/i386.h                   |  1 +
 gcc/config/i386/i386.md                  | 25 ++--
 gcc/config/i386/i386.opt                 |  3 ++
 gcc/testsuite/gcc.target/i386/apx-zu-1.c | 38
 gcc/testsuite/gcc.target/i386/apx-zu-2.c | 19
 6 files changed, 86 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-zu-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-zu-2.c

diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index 5fcc4927978..c7ec0d9fd39 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -142,8 +142,9 @@ enum apx_features {
   apx_ppx = 1 << 3,
   apx_nf = 1 << 4,
   apx_ccmp = 1 << 5,
+  apx_zu = 1 << 6,
   apx_all = apx_egpr | apx_push2pop2 | apx_ndd
-   | apx_ppx | apx_nf | apx_ccmp,
+   | apx_ppx | apx_nf | apx_ccmp | apx_zu,
 };
 
 #endif
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 7051c6c13e4..dc1a1f44320 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -57,6 +57,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_APX_PPX (ix86_apx_features & apx_ppx)
 #define TARGET_APX_NF (ix86_apx_features & apx_nf)
 #define TARGET_APX_CCMP (ix86_apx_features & apx_ccmp)
+#define TARGET_APX_ZU (ix86_apx_features & apx_zu)
 
 #include "config/vxworks-dummy.h"
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index ffcf63e1cba..a2765f65754 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9967,6 +9967,19 @@
(const_string "direct")))
(set_attr "mode" "")])
 
+(define_insn "*imulhi<mode>zu"
+  [(set (match_operand:SWI48x 0 "register_operand" "=r,r")
+   (zero_extend:SWI48x
+ (mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm")
+  (match_operand:HI 2 "immediate_operand" "K,n"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_APX_ZU"
+  "@
+   imulzu{w}\t{%2, %1, %w0|%w0, %1, %2}
+   imulzu{w}\t{%2, %1, %w0|%w0, %1, %2}"
+  [(set_attr "type" "imul")
+   (set_attr "mode" "HI")])
+
 (define_insn "*mulsi3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r,r,r")
(zero_extend:DI
@@ -18354,11 +18367,19 @@
 ;; For all sCOND expanders, also expand the compare or test insn that
 ;; generates cc0.  Generate an equality comparison if `seq' or `sne'.
 
+(define_insn "*setcc_<mode>_zu"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+   (match_operator:SWI248 1 "ix86_comparison_operator"
+ [(reg FLAGS_REG) (const_int 0)]))]
+  "TARGET_APX_ZU"
+  "setzu%C1\t%b0"
+  [(set_attr "type" "setcc")])
+
 (define_insn_and_split "*setcc_di_1"
   [(set (match_operand:DI 0 "register_operand" "=q")
(match_operator:DI 1 "ix86_comparison_operator"
  [(reg FLAGS_REG) (const_int 0)]))]
-  "TARGET_64BIT && !TARGET_PARTIAL_REG_STALL"
+  "!TARGET_APX_ZU && TARGET_64BIT && !TARGET_PARTIAL_REG_STALL"
   "#"
   "&& reload_completed"
   [(set (match_dup 2) (match_dup 1))
@@ -18391,7 +18412,7 @@
   [(set (match_operand:SWI24 0 "register_operand" "=q")
(match_operator:SWI24 1 "ix86_comparison_operator"
  [(reg FLAGS_REG) (const_int 0)]))]
-  "!TARGET_PARTIAL_REG_STALL
+  "!TARGET_APX_ZU && !TARGET_PARTIAL_REG_STALL
&& (!TARGET_ZERO_EXTEND_WITH_AND || optimize_function_for_size_p (cfun))"
   "#"
   "&& reload_completed"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 7017cc87cec..353fffb2343 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1342,6 +1342,9 @@ Enum(apx_features) String(nf) Value(apx_nf) Set(6)
 EnumValue
 Enum(apx_features) String(ccmp) Value(apx_ccmp) Set(7)
 
+EnumValue
+Enum(apx_features) String(zu) Value(apx_zu) Set(8)
+
 EnumValue
 Enum(apx_features) String(all) Value(apx_all) Set(1)
 
diff --git a/gcc/testsuite/gcc.target/i386/apx-zu-1.c b/gcc/testsuite/gcc.target/i386/apx-zu-1.c
new file mode 100644
index 00000000000..927a87673a7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-zu-1.c
@@ -0,0 +1,38 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mapxf -march=x86-64 -O2" } */
+/* { dg-final { scan-assembler-not "setle"} } */
+/* { dg-final { scan-assembler-not "setge"} } */
+/* { dg-final { scan-assembler-not "sete"} } */
+/* { dg-final { scan-assembler-not "xor"} } */
+/* 

[PATCH v3 6/8] [APX NF] Support APX NF for shld/shrd

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (x86_64_shld_nf): New define_insn.
(x86_64_shld_ndd_nf): Ditto.
(x86_64_shld_1_nf): Ditto.
(x86_64_shld_ndd_1_nf): Ditto.
(*x86_64_shld_shrd_1_nozext_nf): Ditto.
(x86_shld_nf): Ditto.
(x86_shld_ndd_nf): Ditto.
(x86_shld_1_nf): Ditto.
(x86_shld_ndd_1_nf): Ditto.
(*x86_shld_shrd_1_nozext_nf): Ditto.
(3_doubleword_lowpart_nf): Ditto.
(x86_64_shrd_nf): Ditto.
(x86_64_shrd_ndd_nf): Ditto.
(x86_64_shrd_1_nf): Ditto.
(x86_64_shrd_ndd_1_nf): Ditto.
(*x86_64_shrd_shld_1_nozext_nf): Ditto.
(x86_shrd_nf): Ditto.
(x86_shrd_ndd_nf): Ditto.
(x86_shrd_1_nf): Ditto.
(x86_shrd_ndd_1_nf): Ditto.
(*x86_shrd_shld_1_nozext_nf): Ditto.
---
 gcc/config/i386/i386.md | 377 +++-
 1 file changed, 296 insertions(+), 81 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9d518e90d07..719cce7d3ef 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14551,7 +14551,7 @@
   DONE;
 })
 
-(define_insn "x86_64_shld"
+(define_insn "x86_64_shld<nf_name>"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0)
  (and:QI (match_operand:QI 2 "nonmemory_operand" "Jc")
@@ -14561,10 +14561,9 @@
(zero_extend:TI
  (match_operand:DI 1 "register_operand" "r"))
(minus:QI (const_int 64)
- (and:QI (match_dup 2) (const_int 63)))) 0)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_64BIT"
-  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
+ (and:QI (match_dup 2) (const_int 63)))) 0)))]
+  "TARGET_64BIT && <nf_condition>"
+  "<nf_prefix>shld{q}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "ishift")
(set_attr "prefix_0f" "1")
(set_attr "mode" "DI")
@@ -14572,7 +14571,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
 
-(define_insn "x86_64_shld_ndd"
+(define_insn "x86_64_shld_ndd"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
  (and:QI (match_operand:QI 3 "nonmemory_operand" "Jc")
@@ -14582,14 +14581,13 @@
(zero_extend:TI
  (match_operand:DI 2 "register_operand" "r"))
(minus:QI (const_int 64)
- (and:QI (match_dup 3) (const_int 63)))) 0)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_APX_NDD"
-  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ (and:QI (match_dup 3) (const_int 63)))) 0)))]
+  "TARGET_APX_NDD && <nf_condition>"
+  "<nf_prefix>shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ishift")
(set_attr "mode" "DI")])
 
-(define_insn "x86_64_shld_1"
+(define_insn "x86_64_shld_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0)
   (match_operand:QI 2 "const_0_to_63_operand"))
@@ -14597,11 +14595,11 @@
  (lshiftrt:TI
(zero_extend:TI
  (match_operand:DI 1 "register_operand" "r"))
-   (match_operand:QI 3 "const_0_to_255_operand")) 0)))
-   (clobber (reg:CC FLAGS_REG))]
+   (match_operand:QI 3 "const_0_to_255_operand")) 0)))]
   "TARGET_64BIT
-   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])"
-  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
+   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
+   && <nf_condition>"
+  "<nf_prefix>shld{q}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "ishift")
(set_attr "prefix_0f" "1")
(set_attr "mode" "DI")
@@ -14610,7 +14608,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
 
-(define_insn "x86_64_shld_ndd_1"
+(define_insn "x86_64_shld_ndd_1"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
   (match_operand:QI 3 "const_0_to_63_operand"))
@@ -14618,15 +14616,66 @@
  (lshiftrt:TI
(zero_extend:TI
  (match_operand:DI 2 "register_operand" "r"))
-   (match_operand:QI 4 "const_0_to_255_operand")) 0)))
-   (clobber (reg:CC FLAGS_REG))]
+   (match_operand:QI 4 "const_0_to_255_operand")) 0)))]
   "TARGET_APX_NDD
-   && INTVAL (operands[4]) == 64 - INTVAL (operands[3])"
-  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+   && INTVAL (operands[4]) == 64 - INTVAL (operands[3])
+   && <nf_condition>"
+  "<nf_prefix>shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ishift")
(set_attr "mode" "DI")
(set_attr "length_immediate" "1")])
 
+(define_insn_and_split "*x86_64_shld_shrd_1_nozext_nf"
+  [(set (match_operand:DI 0 "nonimmediate_operand")
+   (ior:DI (ashift:DI (match_operand:DI 4 

[PATCH v3 7/8] [APX NF] Support APX NF for mul/div

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*mul3_1_nf): New define_insn.
(*mulqi3_1_nf): Ditto.
(*divmod4_noext_nf): Ditto.
(divmodhiqi3_nf): Ditto.
---
 gcc/config/i386/i386.md | 47 ++---
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 719cce7d3ef..e688e92785e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9898,17 +9898,17 @@
 ;;
 ;; On BDVER1, all HI MULs use DoublePath
 
-(define_insn "*mul3_1"
+(define_insn "*mul3_1"
   [(set (match_operand:SWIM248 0 "register_operand" "=r,r,r")
(mult:SWIM248
  (match_operand:SWIM248 1 "nonimmediate_operand" "%rm,rm,0")
- (match_operand:SWIM248 2 "" "K,,r")))
-   (clobber (reg:CC FLAGS_REG))]
-  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ (match_operand:SWIM248 2 "" "K,,r")))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))
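+   && <nf_condition>"
   "@
-   imul{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
-   imul{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
-   imul{<imodesuffix>}\t{%2, %0|%0, %2}"
+   <nf_prefix>imul{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
+   <nf_prefix>imul{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
+   <nf_prefix>imul{<imodesuffix>}\t{%2, %0|%0, %2}"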
   [(set_attr "type" "imul")
(set_attr "prefix_0f" "0,0,1")
(set (attr "athlon_decode")
@@ -9969,14 +9969,14 @@
 ;; MUL reg8Direct
 ;; MUL mem8Direct
 
-(define_insn "*mulqi3_1"
+(define_insn "*mulqi3_1"
   [(set (match_operand:QI 0 "register_operand" "=a")
(mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0")
-(match_operand:QI 2 "nonimmediate_operand" "qm")))
-   (clobber (reg:CC FLAGS_REG))]
+(match_operand:QI 2 "nonimmediate_operand" "qm")))]
   "TARGET_QIMODE_MATH
-   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
-  "mul{b}\t%2"
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))
+   && <nf_condition>"
+  "<nf_prefix>mul{b}\t%2"
   [(set_attr "type" "imul")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
@@ -9,6 +9,19 @@
   [(set_attr "type" "multi")
(set_attr "mode" "SI")])
 
+(define_insn "*divmod4_noext_nf"
+  [(set (match_operand:SWIM248 0 "register_operand" "=a")
+   (any_div:SWIM248
+ (match_operand:SWIM248 2 "register_operand" "0")
+ (match_operand:SWIM248 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWIM248 1 "register_operand" "=d")
+   (:SWIM248 (match_dup 2) (match_dup 3)))
+   (use (match_operand:SWIM248 4 "register_operand" "1"))]
+  "TARGET_APX_NF"
+  "%{nf%} div{}\t%3"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "")])
+
 (define_insn "*divmod4_noext"
   [(set (match_operand:SWIM248 0 "register_operand" "=a")
(any_div:SWIM248
@@ -11266,7 +11279,7 @@
 ;; Change div/mod to HImode and extend the second argument to HImode
 ;; so that mode of div/mod matches with mode of arguments.  Otherwise
 ;; combine may fail.
-(define_insn "divmodhiqi3"
+(define_insn "divmodhiqi3"
   [(set (match_operand:HI 0 "register_operand" "=a")
(ior:HI
  (ashift:HI
@@ -11278,10 +11291,10 @@
(const_int 8))
  (zero_extend:HI
(truncate:QI
- (div:HI (match_dup 1) (any_extend:HI (match_dup 2)))))))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_QIMODE_MATH"
-  "<sgnprefix>div{b}\t%2"
+ (div:HI (match_dup 1) (any_extend:HI (match_dup 2)))))))]
+  "TARGET_QIMODE_MATH
+   && <nf_condition>"
+  "<nf_prefix><sgnprefix>div{b}\t%2"
   [(set_attr "type" "idiv")
(set_attr "mode" "QI")])
 
-- 
2.31.1



[PATCH v3 4/8] [APX NF] Support APX NF for right shift insns

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashr3_1_nf): New.
(*lshr3_1_nf): Ditto.
(*lshrqi3_1_nf): Ditto.
(*lshrhi3_1_nf): Ditto.
---
 gcc/config/i386/i386.md | 82 +++--
 1 file changed, 46 insertions(+), 36 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 4c06c243cc3..d10caf04fcc 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16323,13 +16323,13 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
-(define_insn "*ashr3_1"
+(define_insn "*ashr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(ashiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,r,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,r,c")))]
+  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -16340,11 +16340,11 @@
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !<nf_applied>)
return "sar{<imodesuffix>}\t%0";
   else
-   return use_ndd ? "sar{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}"
-  : "sar{<imodesuffix>}\t{%2, %0|%0, %2}";
+   return use_ndd ? "<nf_prefix>sar{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}"
+  : "<nf_prefix>sar{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,apx_ndd")
@@ -16384,14 +16384,13 @@
 }
 [(set_attr "isa" "*,*,*,apx_ndd")])
 
-
-(define_insn "*lshr3_1"
+(define_insn "*lshr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k,r")
(lshiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,k,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,r,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (LSHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,r,,c")))]
+  "ix86_binary_operator_ok (LSHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -16403,11 +16402,11 @@
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !<nf_applied>)
return "shr{<imodesuffix>}\t%0";
   else
-   return use_ndd ? "shr{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}"
-  : "shr{<imodesuffix>}\t{%2, %0|%0, %2}";
+   return use_ndd ? "<nf_prefix>shr{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}"
+  : "<nf_prefix>shr{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,avx512bw,apx_ndd")
@@ -16423,6 +16422,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; For NF/NDD doesn't support shift count as r, it just support c,
+;; and it has no flag.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+  (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (any_shiftrt:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
@@ -16491,22 +16501,22 @@
(zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2))))]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
-(define_insn "*ashr3_1"
+(define_insn "*ashr3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m, r")
(ashiftrt:SWI12
  (match_operand:SWI12 1 "nonimmediate_operand" "0, rm")
- (match_operand:QI 2 "nonmemory_operand" "c, c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c, c")))]
+  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   if (operands[2] == const1_rtx
   && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
-  && !use_ndd)
+  && !use_ndd && !)
 return "sar{}\t%0";
   else
-return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sar{}\t{%2, %0|%0, %2}";
+return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
+  : "sar{}\t{%2, %0|%0, %2}";
 }
   [(set_attr "isa" "*, apx_ndd")
(set_attr "type" "ishift")
@@ -16519,13 +16529,13 @@
(const_string "*")))
(set_attr "mode" "")])
 
-(define_insn "*lshrqi3_1"
+(define_insn "*lshrqi3_1"
   [(set (match_operand:QI 0 "nonimmediate_operand"  "=qm,?k,r")
(lshiftrt:QI
  

[PATCH v3 8/8] [APX NF] Support APX NF for lzcnt/tzcnt/popcnt

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (clz2_lzcnt_nf): New define_insn.
(*clz2_lzcnt_falsedep_nf): Ditto.
(__nf): Ditto.
(*__falsedep_nf): Ditto.
(_hi_nf): Ditto.
(popcount2_nf): Ditto.
(*popcount2_falsedep_nf): Ditto.
(popcounthi2_nf): Ditto.
---
 gcc/config/i386/i386.md | 124 
 1 file changed, 113 insertions(+), 11 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e688e92785e..b0eb497cd23 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -20269,6 +20269,24 @@
   operands[3] = gen_reg_rtx (mode);
 })
 
+(define_insn_and_split "clz2_lzcnt_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (clz:SWI48 (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "clz2_lzcnt"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -20292,6 +20310,18 @@
 ; False dependency happens when destination is only updated by tzcnt,
 ; lzcnt or popcnt.  There is no false dependency when destination is
 ; also used in source.
+(define_insn "*clz2_lzcnt_falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn "*clz2_lzcnt_falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -20398,6 +20428,25 @@
 ;; Version of lzcnt/tzcnt that is expanded from intrinsics.  This version
 ;; provides operand size as output when source operand is zero. 
 
+(define_insn_and_split "__nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (unspec:SWI48 [(match_dup 1)] LT_ZCNT))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "_"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -20422,6 +20471,19 @@
 ; False dependency happens when destination is only updated by tzcnt,
 ; lzcnt or popcnt.  There is no false dependency when destination is
 ; also used in source.
+(define_insn "*__falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn "*__falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -20436,13 +20498,12 @@
(set_attr "prefix_rep" "1")
(set_attr "mode" "")])
 
-(define_insn "_hi"
+(define_insn "_hi"
   [(set (match_operand:HI 0 "register_operand" "=r")
(unspec:HI
- [(match_operand:HI 1 "nonimmediate_operand" "rm")] LT_ZCNT))
-   (clobber (reg:CC FLAGS_REG))]
-  ""
-  "{w}\t{%1, %0|%0, %1}"
+ [(match_operand:HI 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  ""
+  "{w}\t{%1, %0|%0, %1}"
   [(set_attr "type" "")
(set_attr "prefix_0f" "1")
(set_attr "prefix_rep" "1")
@@ -20860,6 +20921,30 @@
   [(set_attr "type" "bitmanip")
(set_attr "mode" "")])
 
+(define_insn_and_split "popcount2_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (popcount:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_POPCNT"
+{
+#if TARGET_MACHO
+  return "%{nf%} popcnt\t{%1, %0|%0, %1}";
+#else
+  return "%{nf%} popcnt{}\t{%1, %0|%0, %1}";
+#endif
+}
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && 

[PATCH v3 5/8] [APX NF] Support APX NF for rotate insns

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (ashr3_cvt_nf): New define_insn.
(*3_1_nf): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-nf.c: Add NF test for rotate insns.
---
 gcc/config/i386/i386.md| 59 +-
 gcc/testsuite/gcc.target/i386/apx-nf.c |  5 +++
 2 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d10caf04fcc..9d518e90d07 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16245,19 +16245,19 @@
 (define_mode_attr cvt_mnemonic
   [(SI "{cltd|cdq}") (DI "{cqto|cqo}")])
 
-(define_insn "ashr3_cvt"
+(define_insn "ashr3_cvt"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=*d,rm,r")
(ashiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "*a,0,rm")
- (match_operand:QI 2 "const_int_operand")))
-   (clobber (reg:CC FLAGS_REG))]
+ (match_operand:QI 2 "const_int_operand")))]
   "INTVAL (operands[2]) == GET_MODE_BITSIZE (mode)-1
&& (TARGET_USE_CLTD || optimize_function_for_size_p (cfun))
-   && ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+   && ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
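+   && <nf_condition>"
   "@
    <cvt_mnemonic>
-   sar{<imodesuffix>}\t{%2, %0|%0, %2}
-   sar{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}"
+   <nf_prefix>sar{<imodesuffix>}\t{%2, %0|%0, %2}
+   <nf_prefix>sar{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}"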
   [(set_attr "isa" "*,*,apx_ndd")
(set_attr "type" "imovx,ishift,ishift")
(set_attr "prefix_0f" "0,*,*")
@@ -17109,28 +17109,31 @@
   [(set_attr "type" "rotatex")
(set_attr "mode" "")])
 
-(define_insn "*3_1"
+(define_insn "*3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(any_rotate:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,,c")))]
+  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
 {
 case TYPE_ROTATEX:
-  return "#";
+  if (TARGET_APX_NDD && )
+   return "%{nf%} {}\t{%2, %1, %0|%0, %1, %2}";
+  else
+   return "#";
 
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "{}\t%0";
   else
-   return use_ndd ? "{}\t{%2, %1, %0|%0, %1, %2}"
-  : "{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "{}\t{%2, %1, %0|%0, 
%1, %2}"
+  : "{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,apx_ndd")
@@ -17164,6 +17167,20 @@
   operands[2] = GEN_INT ((bitsize - INTVAL (operands[2])) % bitsize);
 })
 
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (rotate:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+ (match_operand:QI 2 "const_int_operand")))]
+  "TARGET_BMI2 && reload_completed && !optimize_function_for_size_p (cfun)
+   && !TARGET_APX_NDD"
+  [(set (match_dup 0)
+   (rotatert:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  int bitsize = GET_MODE_BITSIZE (mode);
+
+  operands[2] = GEN_INT ((bitsize - INTVAL (operands[2])) % bitsize);
+})
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
@@ -17251,22 +17268,22 @@
   [(set (match_dup 0)
(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2))))])
 
-(define_insn "*3_1"
+(define_insn "*3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m,r")
(any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,c")))]
+  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   if (operands[2] == const1_rtx
   && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
-  && !use_ndd)
+  && !use_ndd && !)
 return "{}\t%0";
   else
 return use_ndd
-  ? "{}\t{%2, %1, %0|%0, %1, %2}"
-  : "{}\t{%2, %0|%0, %2}";
+  ? "{}\t{%2, %1, %0|%0, %1, %2}"
+  : "{}\t{%2, %0|%0, %2}";
 }
   [(set_attr "isa" "*,apx_ndd")
(set_attr "type" "rotate")
diff --git a/gcc/testsuite/gcc.target/i386/apx-nf.c 
b/gcc/testsuite/gcc.target/i386/apx-nf.c
index f33a994f0b7..ed859b399b8 100644
--- a/gcc/testsuite/gcc.target/i386/apx-nf.c
+++ b/gcc/testsuite/gcc.target/i386/apx-nf.c
@@ -2,6 +2,7 @@
 /* { dg-options "-mapx-features=egpr,push2pop2,ndd,ppx,nf -march=x86-64 -O2" } */
 /* { 

[PATCH v3 2/8] [APX NF] Support APX NF for {sub/and/or/xor/neg}

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (nf_nonf_attr): New subst_attr.
(nf_nonf_x64_attr): Ditto.
(*sub_1_nf): New define_insn.
(*anddi_1_nf): Ditto.
(*and_1_nf): Ditto.
(*qi_1_nf): Ditto.
(*_1_nf): Ditto.
(*neg_1_nf): Ditto.
* config/i386/sse.md : New define_split.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-nf.c: Add test.
---
 gcc/config/i386/i386.md| 173 +
 gcc/config/i386/sse.md |  11 ++
 gcc/testsuite/gcc.target/i386/apx-nf.c |  12 ++
 3 files changed, 114 insertions(+), 82 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-nf.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 1eeadaddeba..d3cb224abad 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -575,7 +575,7 @@
noavx512dq,fma_or_avx512vl,avx512vl,noavx512vl,avxvnni,
avx512vnnivl,avx512fp16,avxifma,avx512ifmavl,avxneconvert,
avx512bf16vl,vpclmulqdqvl,avx_noavx512f,avx_noavx512vl,
-   vaes_avx512vl"
+   vaes_avx512vl,noapx_nf"
   (const_string "base"))
 
 ;; The (bounding maximum) length of an instruction immediate.
@@ -981,6 +981,7 @@
   (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX")
 (eq_attr "mmx_isa" "avx")
   (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX")
+(eq_attr "isa" "noapx_nf") (symbol_ref "!TARGET_APX_NF")
]
(const_int 1)))
 
@@ -6449,6 +6450,8 @@
 (define_subst_attr "nf_condition" "nf_subst" "TARGET_APX_NF" "true")
 (define_subst_attr "nf_mem_constraint" "nf_subst" "je" "m")
 (define_subst_attr "nf_applied" "nf_subst" "true" "false")
+(define_subst_attr "nf_nonf_attr" "nf_subst"  "noapx_nf" "*")
+(define_subst_attr "nf_nonf_x64_attr" "nf_subst" "noapx_nf" "x64")
 
 (define_subst "nf_subst"
   [(set (match_operand:SWI 0)
@@ -7893,20 +7896,21 @@
   "split_double_mode (mode, [0], 2, [0], [3]);"
 [(set_attr "isa" "*,*,apx_ndd,apx_ndd")])
 
-(define_insn "*sub_1"
-  [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,,r,r,r")
+(define_insn "*sub_1"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" 
"=m,r,,r,r,r")
(minus:SWI
- (match_operand:SWI 1 "nonimmediate_operand" "0,0,rm,rjM,r")
- (match_operand:SWI 2 "" ",,r,,")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)"
+ (match_operand:SWI 1 "nonimmediate_operand" "0,0,0,rm,rjM,r")
+ (match_operand:SWI 2 "" ",,,r,,")))]
+  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)
+  && "
   "@
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd")
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd,apx_ndd")
(set_attr "type" "alu")
(set_attr "mode" "")])
 
@@ -11795,27 +11799,28 @@
 }
 [(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd_64,apx_ndd")])
 
-(define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm,r,r,r,r,r,?k")
+(define_insn "*anddi_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand" 
"=r,r,rm,r,r,r,r,r,r,?k")
(and:DI
-(match_operand:DI 1 "nonimmediate_operand" "%0,r,0,0,rm,rjM,r,qm,k")
-(match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,re,m,r,e,m,L,k")))
-   (clobber (reg:CC FLAGS_REG))]
+(match_operand:DI 1 "nonimmediate_operand" "%0,r,0,0,0,rm,rjM,r,qm,k")
+(match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,r,e,m,r,e,m,L,k")))]
   "TARGET_64BIT
-   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)"
+   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)
+   && "
   "@
-   and{l}\t{%k2, %k0|%k0, %k2}
-   and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
-   and{q}\t{%2, %0|%0, %2}
-   and{q}\t{%2, %0|%0, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{l}\t{%k2, %k0|%k0, %k2}
+   and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
#
#"
-  [(set_attr "isa" "x64,apx_ndd,x64,x64,apx_ndd,apx_ndd,apx_ndd,x64,avx512bw")
-   (set_attr "type" "alu,alu,alu,alu,alu,alu,alu,imovx,msklog")
-   (set_attr "length_immediate" "*,*,*,*,*,*,*,0,*")
+  [(set_attr "isa" 
"x64,apx_ndd,x64,x64,x64,apx_ndd,apx_ndd,apx_ndd,,avx512bw")
+   (set_attr "type" "alu,alu,alu,alu,alu,alu,alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,*,*,*,*,*,*,0,*")
(set (attr 

[PATCH v3 3/8] [APX NF] Support APX NF for left shift insns

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashl3_1_nf): New.
(*ashlhi3_1_nf): Ditto.
(*ashlqi3_1_nf): Ditto.
* config/i386/sse.md: New define_split.
---
 gcc/config/i386/i386.md | 96 ++---
 gcc/config/i386/sse.md  | 13 ++
 2 files changed, 83 insertions(+), 26 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d3cb224abad..4c06c243cc3 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -15011,17 +15011,22 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
-(define_insn "*ashl3_1"
+(define_insn "*ashl3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,?k,r")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" 
"0,l,rm,k,rm")
- (match_operand:QI 2 "nonmemory_operand" 
"c,M,r,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" 
"c,M,r,,c")))]
+  "ix86_binary_operator_ok (ASHIFT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
 {
 case TYPE_LEA:
+  if (TARGET_APX_NDD && )
+   return "%{nf%} sal{}\t{%2, %1, %0|%0, %1, %2}";
+  else
+   return "#";
+
 case TYPE_ISHIFTX:
 case TYPE_MSKLOG:
   return "#";
@@ -15029,7 +15034,7 @@
 case TYPE_ALU:
   gcc_assert (operands[2] == const1_rtx);
   gcc_assert (rtx_equal_p (operands[0], operands[1]));
-  return "add{}\t%0, %0";
+  return "add{}\t%0, %0";
 
 default:
   if (operands[2] == const1_rtx
@@ -15037,11 +15042,11 @@
  /* For NDD form instructions related to TARGET_SHIFT1, the $1
 immediate do not need to be omitted as assembler will map it
 to use shorter encoding. */
- && !use_ndd)
+ && !use_ndd && !)
return "sal{}\t%0";
   else
-   return use_ndd ? "sal{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sal{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sal{}\t{%2, %1, %0|%0, %1, 
%2}"
+  : "sal{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,*,bmi2,avx512bw,apx_ndd")
@@ -15072,6 +15077,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; The NF/NDD form does not support a shift count in r; it only supports c,
+;; and the pattern has no flags clobber.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+ (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (ashift:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
@@ -15158,32 +15174,37 @@
(zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
-(define_insn "*ashlhi3_1"
+(define_insn "*ashlhi3_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp,?k,r")
(ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l,k,rm")
-  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww,cI")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFT, HImode, operands, TARGET_APX_NDD)"
+  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww,cI")))]
+  "ix86_binary_operator_ok (ASHIFT, HImode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
 {
 case TYPE_LEA:
+  if (TARGET_APX_NDD && )
+   return "%{nf%} sal{w}\t{%2, %1, %0|%0, %1, %2}";
+  else
+   return "#";
+
 case TYPE_MSKLOG:
   return "#";
 
 case TYPE_ALU:
   gcc_assert (operands[2] == const1_rtx);
-  return "add{w}\t%0, %0";
+  return "add{w}\t%0, %0";
 
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "sal{w}\t%0";
   else
-   return use_ndd ? "sal{w}\t{%2, %1, %0|%0, %1, %2}"
-  : "sal{w}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sal{w}\t{%2, %1, %0|%0, %1, %2}"
+  : "sal{w}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,*,avx512f,apx_ndd")
@@ -15211,31 +15232,36 @@
(const_string "*")))
(set_attr "mode" "HI,SI,HI,HI")])
 
-(define_insn "*ashlqi3_1"
+(define_insn "*ashlqi3_1"
   [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,Yp,?k,r")
(ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,l,k,rm")
-  (match_operand:QI 2 "nonmemory_operand" "cI,cI,M,Wb,cI")))
-   (clobber (reg:CC FLAGS_REG))]
-  

[PATCH v3 1/8] [APX NF]: Support APX NF add

2024-05-28 Thread Kong, Lingling
Hi, compared with v2, these patches restore the original lea pattern position
and address Hongtao's comment.

The APX NF (no flags) feature suppresses the update of status flags
for arithmetic operations.

For NF add, it is not clear whether NF add can be faster than lea. If so,
the pattern needs to be adjusted to prefer lea generation.

gcc/ChangeLog:

* config/i386/i386-opts.h (enum apx_features): Add nf
enumeration.
* config/i386/i386.h (TARGET_APX_NF): New.
* config/i386/i386.md (*add<mode>_1_nf): New define_insn.
* config/i386/i386.opt: Add apx_nf enumeration.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd.c: Fixed test.

Co-authored-by: Lingling Kong 
---
 gcc/config/i386/i386-opts.h |   3 +-
 gcc/config/i386/i386.h  |   1 +
 gcc/config/i386/i386.md | 135 
 gcc/config/i386/i386.opt|   3 +
 gcc/testsuite/gcc.target/i386/apx-ndd.c |   2 +-
 5 files changed, 98 insertions(+), 46 deletions(-)

diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index ef2825803b3..60176ce609f 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -140,7 +140,8 @@ enum apx_features {
   apx_push2pop2 = 1 << 1,
   apx_ndd = 1 << 2,
   apx_ppx = 1 << 3,
-  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx,
+  apx_nf = 1<< 4,
+  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx | apx_nf,
 };
 
 #endif
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 359a8408263..969391d3013 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -55,6 +55,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 #define TARGET_APX_PUSH2POP2 (ix86_apx_features & apx_push2pop2)
 #define TARGET_APX_NDD (ix86_apx_features & apx_ndd)
 #define TARGET_APX_PPX (ix86_apx_features & apx_ppx)
+#define TARGET_APX_NF (ix86_apx_features & apx_nf)
 
 #include "config/vxworks-dummy.h"
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e8073f5a200..1eeadaddeba 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6290,6 +6290,13 @@
   [(parallel [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))
   (clobber (reg:CC FLAGS_REG))])]
   "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
+
+(define_split
+  [(set (match_operand:SWI48 0 "general_reg_operand")
+   (mult:SWI48 (match_dup 0) (match_operand:SWI48 1 "const1248_operand")))]
+  "TARGET_APX_NF && reload_completed"
+  [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))]
+  "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
 

 ;; Add instructions
 
@@ -6437,48 +6444,65 @@
  (clobber (reg:CC FLAGS_REG))])]
  "split_double_mode (mode, [0], 1, [0], [5]);")
 
-(define_insn "*add_1"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r,r,r,r")
+(define_subst_attr "nf_name" "nf_subst" "_nf" "")
+(define_subst_attr "nf_prefix" "nf_subst" "%{nf%} " "")
+(define_subst_attr "nf_condition" "nf_subst" "TARGET_APX_NF" "true")
+(define_subst_attr "nf_mem_constraint" "nf_subst" "je" "m")
+(define_subst_attr "nf_applied" "nf_subst" "true" "false")
+
+(define_subst "nf_subst"
+  [(set (match_operand:SWI 0)
+(match_operand:SWI 1))]
+  ""
+  [(set (match_dup 0)
+   (match_dup 1))
+   (clobber (reg:CC FLAGS_REG))])
+
+(define_insn "*add_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" 
"=rm,r,r,r,r,r,r,r")
(plus:SWI48
- (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r,rje,jM,r")
- (match_operand:SWI48 2 "x86_64_general_operand" "re,BM,0,le,r,e,BM")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (PLUS, mode, operands, TARGET_APX_NDD)"
+ (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,0,r,r,rje,jM,r")
+ (match_operand:SWI48 2 "x86_64_general_operand" 
"r,e,BM,0,le,r,e,BM")))]
+  "ix86_binary_operator_ok (PLUS, mode, operands, TARGET_APX_NDD)
+  && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
 {
 case TYPE_LEA:
-  return "#";
+  if (TARGET_APX_NDD && )
+   return "%{nf%} add{}\t{%2, %1, %0|%0, %1, %2}";
+  else
+   return "#";
 
 case TYPE_INCDEC:
   if (operands[2] == const1_rtx)
-return use_ndd ? "inc{}\t{%1, %0|%0, %1}"
- : "inc{}\t%0";
+return use_ndd ? "inc{}\t{%1, %0|%0, %1}"
+ : "inc{}\t%0";
   else
 {
  gcc_assert (operands[2] == constm1_rtx);
- return use_ndd ? "dec{}\t{%1, %0|%0, %1}"
-   : "dec{}\t%0";
+ return use_ndd ? "dec{}\t{%1, %0|%0, %1}"
+   : "dec{}\t%0";
}
 
 default:
   /* For most processors, ADD is faster than LEA.  This alternative
 was added to use ADD as much as possible.  */
-  if (which_alternative == 2)
+  

RE: [PATCH v2 2/8] [APX NF] Support APX NF for {sub/and/or/xor/neg}

2024-05-22 Thread Kong, Lingling
Cc Uros.

From: Kong, Lingling 
Sent: Wednesday, May 22, 2024 4:35 PM
To: gcc-patches@gcc.gnu.org
Cc: Liu, Hongtao ; Kong, Lingling 

Subject: [PATCH v2 2/8] [APX NF] Support APX NF for {sub/and/or/xor/neg}

gcc/ChangeLog:

   * config/i386/i386.md (nf_and_applied): New subst_attr.
   (nf_x64_and_applied): Ditto.
   (*sub<mode>_1_nf): New define_insn.
   (*anddi_1_nf): Ditto.
   (*and<mode>_1_nf): Ditto.
   (*<code>qi_1_nf): Ditto.
   (*<code><mode>_1_nf): Ditto.
   (*neg<mode>_1_nf): Ditto.
   * config/i386/sse.md: New define_split.

gcc/testsuite/ChangeLog:

   * gcc.target/i386/apx-nf.c: Add test.
---
gcc/config/i386/i386.md| 174 +
gcc/config/i386/sse.md |  11 ++
gcc/testsuite/gcc.target/i386/apx-nf.c |   9 ++
3 files changed, 112 insertions(+), 82 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index bae344518bd..099d7f35c8f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -575,7 +575,7 @@
 
noavx512dq,fma_or_avx512vl,avx512vl,noavx512vl,avxvnni,
 
avx512vnnivl,avx512fp16,avxifma,avx512ifmavl,avxneconvert,
 
avx512bf16vl,vpclmulqdqvl,avx_noavx512f,avx_noavx512vl,
- vaes_avx512vl"
+vaes_avx512vl,noapx_nf"
   (const_string "base"))

 ;; The (bounding maximum) length of an instruction immediate.
@@ -981,6 +981,7 @@
 (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX")
   (eq_attr "mmx_isa" "avx")
 (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX")
+ (eq_attr "isa" "noapx_nf") (symbol_ref "!TARGET_APX_NF")
  ]
  (const_int 1)))

@@ -7893,20 +7894,21 @@
   "split_double_mode (mode, [0], 2, [0], [3]);"
[(set_attr "isa" "*,*,apx_ndd,apx_ndd")])

-(define_insn "*sub_1"
-  [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,,r,r,r")
+(define_insn "*sub_1"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" 
"=m,r,,r,r,r")
  (minus:SWI
-(match_operand:SWI 1 "nonimmediate_operand" "0,0,rm,rjM,r")
-(match_operand:SWI 2 "" 
",,r,,")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)"
+   (match_operand:SWI 1 "nonimmediate_operand" "0,0,0,rm,rjM,r")
+   (match_operand:SWI 2 "" 
",,,r,,")))]
+  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)
+  && "
   "@
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd")
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd,apx_ndd")
(set_attr "type" "alu")
(set_attr "mode" "")])

@@ -11795,27 +11797,31 @@
}
[(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd_64,apx_ndd")])

-(define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm,r,r,r,r,r,?k")
+(define_subst_attr "nf_and_applied" "nf_subst"  "noapx_nf" "*")
+(define_subst_attr "nf_x64_and_applied" "nf_subst" "noapx_nf" "x64")
+
+(define_insn "*anddi_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand" 
"=r,r,rm,r,r,r,r,r,r,?k")
  (and:DI
-  (match_operand:DI 1 "nonimmediate_operand" 
"%0,r,0,0,rm,rjM,r,qm,k")
-  (match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,re,m,r,e,m,L,k")))
-   (clobber (reg:CC FLAGS_REG))]
+ (match_operand:DI 1 "nonimmediate_operand" 
"%0,r,0,0,0,rm,rjM,r,qm,k")
+ (match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,r,e,m,r,e,m,L,k")))]
   "TARGET_64BIT
-   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)"
+   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)
+   && "
   "@
-   and{l}\t{%k2, %k0|%k0, %k2}
-   and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
-   and{q}\t{%2, %0|%0, %2}
-   and{q}\t{%2, %0|%0, %2}
-   and{q}\t{%

[PATCH v2 8/8] [APX NF] Support APX NF for lzcnt/tzcnt/popcnt

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (clz<mode>2_lzcnt_nf): New define_insn.
(*clz<mode>2_lzcnt_falsedep_nf): Ditto.
(<lt_zcnt>_<mode>_nf): Ditto.
(*<lt_zcnt>_<mode>_falsedep_nf): Ditto.
(<lt_zcnt>_hi_nf): Ditto.
(popcount<mode>2_nf): Ditto.
(*popcount<mode>2_falsedep_nf): Ditto.
(popcounthi2_nf): Ditto.
---
 gcc/config/i386/i386.md | 124 
 1 file changed, 113 insertions(+), 11 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
087761e5b3a..c9a3a99ca70 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -20250,6 +20250,24 @@
   operands[3] = gen_reg_rtx (mode);
 })
 
+(define_insn_and_split "clz2_lzcnt_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (clz:SWI48 (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "clz2_lzcnt"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -20273,6 +20291,18 @@
 ; False dependency happens when destination is only updated by tzcnt,
 ; lzcnt or popcnt.  There is no false dependency when destination is
 ; also used in source.
+(define_insn "*clz2_lzcnt_falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn "*clz2_lzcnt_falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -20379,6 +20409,25 @@
 ;; Version of lzcnt/tzcnt that is expanded from intrinsics.  This version
 ;; provides operand size as output when source operand is zero.
 
+(define_insn_and_split "__nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (unspec:SWI48 [(match_dup 1)] LT_ZCNT))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "_"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -20403,6 +20452,19 @@
 ; False dependency happens when destination is only updated by tzcnt,
 ; lzcnt or popcnt.  There is no false dependency when destination is
 ; also used in source.
+(define_insn "*__falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn "*__falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -20417,13 +20479,12 @@
(set_attr "prefix_rep" "1")
(set_attr "mode" "")])
 
-(define_insn "_hi"
+(define_insn "_hi"
   [(set (match_operand:HI 0 "register_operand" "=r")
(unspec:HI
- [(match_operand:HI 1 "nonimmediate_operand" "rm")] LT_ZCNT))
-   (clobber (reg:CC FLAGS_REG))]
-  ""
-  "{w}\t{%1, %0|%0, %1}"
+ [(match_operand:HI 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  ""
+  "{w}\t{%1, %0|%0, %1}"
   [(set_attr "type" "")
(set_attr "prefix_0f" "1")
(set_attr "prefix_rep" "1")
@@ -20841,6 +20902,30 @@
   [(set_attr "type" "bitmanip")
(set_attr "mode" "")])
 
+(define_insn_and_split "popcount2_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (popcount:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_POPCNT"
+{
+#if TARGET_MACHO
+  return "%{nf%} popcnt\t{%1, %0|%0, %1}"; #else
+  return "%{nf%} popcnt{}\t{%1, %0|%0, %1}"; #endif }
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && 

[PATCH v2 7/8] [APX NF] Support APX NF for mul/div

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*mul<mode>3_1_nf): New define_insn.
(*mulqi3_1_nf): Ditto.
(*<u>divmod<mode>4_noext_nf): Ditto.
(<u>divmodhiqi3_nf): Ditto.
---
 gcc/config/i386/i386.md | 47 ++---
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
4d684e8d919..087761e5b3a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9896,17 +9896,17 @@
 ;;
 ;; On BDVER1, all HI MULs use DoublePath
 
-(define_insn "*mul3_1"
+(define_insn "*mul3_1"
   [(set (match_operand:SWIM248 0 "register_operand" "=r,r,r")
(mult:SWIM248
  (match_operand:SWIM248 1 "nonimmediate_operand" "%rm,rm,0")
- (match_operand:SWIM248 2 "" "K,,r")))
-   (clobber (reg:CC FLAGS_REG))]
-  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ (match_operand:SWIM248 2 "" "K,,r")))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))
+   && "
   "@
-   imul{}\t{%2, %1, %0|%0, %1, %2}
-   imul{}\t{%2, %1, %0|%0, %1, %2}
-   imul{}\t{%2, %0|%0, %2}"
+   imul{}\t{%2, %1, %0|%0, %1, %2}
+   imul{}\t{%2, %1, %0|%0, %1, %2}
+   imul{}\t{%2, %0|%0, %2}"
   [(set_attr "type" "imul")
(set_attr "prefix_0f" "0,0,1")
(set (attr "athlon_decode")
@@ -9967,14 +9967,14 @@
 ;; MUL reg8Direct
 ;; MUL mem8Direct
 
-(define_insn "*mulqi3_1"
+(define_insn "*mulqi3_1"
   [(set (match_operand:QI 0 "register_operand" "=a")
(mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0")
-(match_operand:QI 2 "nonimmediate_operand" "qm")))
-   (clobber (reg:CC FLAGS_REG))]
+(match_operand:QI 2 "nonimmediate_operand" "qm")))]
   "TARGET_QIMODE_MATH
-   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
-  "mul{b}\t%2"
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))
+   && "
+  "mul{b}\t%2"
   [(set_attr "type" "imul")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
@@ -7,6 +7,19 @@
   [(set_attr "type" "multi")
(set_attr "mode" "SI")])
 
+(define_insn "*divmod4_noext_nf"
+  [(set (match_operand:SWIM248 0 "register_operand" "=a")
+   (any_div:SWIM248
+ (match_operand:SWIM248 2 "register_operand" "0")
+ (match_operand:SWIM248 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWIM248 1 "register_operand" "=d")
+   (:SWIM248 (match_dup 2) (match_dup 3)))
+   (use (match_operand:SWIM248 4 "register_operand" "1"))]
+  "TARGET_APX_NF"
+  "%{nf%} div{}\t%3"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "")])
+
 (define_insn "*divmod4_noext"
   [(set (match_operand:SWIM248 0 "register_operand" "=a")
(any_div:SWIM248
@@ -11264,7 +11277,7 @@
 ;; Change div/mod to HImode and extend the second argument to HImode
 ;; so that mode of div/mod matches with mode of arguments.  Otherwise
 ;; combine may fail.
-(define_insn "divmodhiqi3"
+(define_insn "divmodhiqi3"
   [(set (match_operand:HI 0 "register_operand" "=a")
(ior:HI
  (ashift:HI
@@ -11276,10 +11289,10 @@
(const_int 8))
  (zero_extend:HI
(truncate:QI
- (div:HI (match_dup 1) (any_extend:HI (match_dup 2)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_QIMODE_MATH"
-  "div{b}\t%2"
+ (div:HI (match_dup 1) (any_extend:HI (match_dup 2)))]
+  "TARGET_QIMODE_MATH
+   && "
+  "div{b}\t%2"
   [(set_attr "type" "idiv")
(set_attr "mode" "QI")])
 
--
2.31.1



[PATCH v2 6/8] [APX NF] Support APX NF for shld/shrd

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (x86_64_shld_nf): New define_insn.
(x86_64_shld_ndd_nf): Ditto.
(x86_64_shld_1_nf): Ditto.
(x86_64_shld_ndd_1_nf): Ditto.
(*x86_64_shld_shrd_1_nozext_nf): Ditto.
(x86_shld_nf): Ditto.
(x86_shld_ndd_nf): Ditto.
(x86_shld_1_nf): Ditto.
(x86_shld_ndd_1_nf): Ditto.
(*x86_shld_shrd_1_nozext_nf): Ditto.
(3_doubleword_lowpart_nf): Ditto.
(x86_64_shrd_nf): Ditto.
(x86_64_shrd_ndd_nf): Ditto.
(x86_64_shrd_1_nf): Ditto.
(x86_64_shrd_ndd_1_nf): Ditto.
(*x86_64_shrd_shld_1_nozext_nf): Ditto.
(x86_shrd_nf): Ditto.
(x86_shrd_ndd_nf): Ditto.
(x86_shrd_1_nf): Ditto.
(x86_shrd_ndd_1_nf): Ditto.
(*x86_shrd_shld_1_nozext_nf): Ditto.
---
 gcc/config/i386/i386.md | 377 +++-
 1 file changed, 296 insertions(+), 81 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
731eb12d13a..4d684e8d919 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14552,7 +14552,7 @@
   DONE;
 })
 
-(define_insn "x86_64_shld"
+(define_insn "x86_64_shld"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0)
  (and:QI (match_operand:QI 2 "nonmemory_operand" "Jc") @@ 
-14562,10 +14562,9 @@
(zero_extend:TI
  (match_operand:DI 1 "register_operand" "r"))
(minus:QI (const_int 64)
- (and:QI (match_dup 2) (const_int 63 0)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_64BIT"
-  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
+ (and:QI (match_dup 2) (const_int 63 0)))]
+  "TARGET_64BIT && "
+  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "ishift")
(set_attr "prefix_0f" "1")
(set_attr "mode" "DI")
@@ -14573,7 +14572,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
 
-(define_insn "x86_64_shld_ndd"
+(define_insn "x86_64_shld_ndd"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
  (and:QI (match_operand:QI 3 "nonmemory_operand" "Jc") @@ 
-14583,14 +14582,13 @@
(zero_extend:TI
  (match_operand:DI 2 "register_operand" "r"))
(minus:QI (const_int 64)
- (and:QI (match_dup 3) (const_int 63 0)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_APX_NDD"
-  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ (and:QI (match_dup 3) (const_int 63 0)))]
+  "TARGET_APX_NDD && "
+  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ishift")
(set_attr "mode" "DI")])
 
-(define_insn "x86_64_shld_1"
+(define_insn "x86_64_shld_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0)
   (match_operand:QI 2 "const_0_to_63_operand")) @@ 
-14598,11 +14596,11 @@
  (lshiftrt:TI
(zero_extend:TI
  (match_operand:DI 1 "register_operand" "r"))
-   (match_operand:QI 3 "const_0_to_255_operand")) 0)))
-   (clobber (reg:CC FLAGS_REG))]
+   (match_operand:QI 3 "const_0_to_255_operand")) 0)))]
   "TARGET_64BIT
-   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])"
-  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
+   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
+   && "
+  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "ishift")
(set_attr "prefix_0f" "1")
(set_attr "mode" "DI")
@@ -14611,7 +14609,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
 
-(define_insn "x86_64_shld_ndd_1"
+(define_insn "x86_64_shld_ndd_1"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
   (match_operand:QI 3 "const_0_to_63_operand")) @@ 
-14619,15 +14617,66 @@
  (lshiftrt:TI
(zero_extend:TI
  (match_operand:DI 2 "register_operand" "r"))
-   (match_operand:QI 4 "const_0_to_255_operand")) 0)))
-   (clobber (reg:CC FLAGS_REG))]
+   (match_operand:QI 4 "const_0_to_255_operand")) 0)))]
   "TARGET_APX_NDD
-   && INTVAL (operands[4]) == 64 - INTVAL (operands[3])"
-  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+   && INTVAL (operands[4]) == 64 - INTVAL (operands[3])
+   && "
+  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ishift")
(set_attr "mode" "DI")
(set_attr "length_immediate" "1")])
 
+(define_insn_and_split "*x86_64_shld_shrd_1_nozext_nf"
+  [(set (match_operand:DI 0 "nonimmediate_operand")
+   (ior:DI (ashift:DI (match_operand:DI 4 

[PATCH v2 5/8] [APX NF] Support APX NF for rotate insns

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (ashr<mode>3_cvt_nf): New define_insn.
(*<insn><mode>3_1_nf): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-nf.c: Add NF test for rotate insns.
---
 gcc/config/i386/i386.md| 53 --
 gcc/testsuite/gcc.target/i386/apx-nf.c |  5 +++
 2 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
7f191749342..731eb12d13a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16230,19 +16230,19 @@
 (define_mode_attr cvt_mnemonic
   [(SI "{cltd|cdq}") (DI "{cqto|cqo}")])
 
-(define_insn "ashr3_cvt"
+(define_insn "ashr3_cvt"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=*d,rm,r")
(ashiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "*a,0,rm")
- (match_operand:QI 2 "const_int_operand")))
-   (clobber (reg:CC FLAGS_REG))]
+ (match_operand:QI 2 "const_int_operand")))]
   "INTVAL (operands[2]) == GET_MODE_BITSIZE (mode)-1
&& (TARGET_USE_CLTD || optimize_function_for_size_p (cfun))
-   && ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+   && ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
   "@

-   sar{}\t{%2, %0|%0, %2}
-   sar{}\t{%2, %1, %0|%0, %1, %2}"
+   sar{}\t{%2, %0|%0, %2}
+   sar{}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "*,*,apx_ndd")
(set_attr "type" "imovx,ishift,ishift")
(set_attr "prefix_0f" "0,*,*")
@@ -17094,13 +17094,13 @@
   [(set_attr "type" "rotatex")
(set_attr "mode" "")])
 
-(define_insn "*3_1"
+(define_insn "*3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(any_rotate:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,,c")))]
+  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -17111,11 +17111,11 @@
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "{}\t%0";
   else
-   return use_ndd ? "{}\t{%2, %1, %0|%0, %1, %2}"
-  : "{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "{}\t{%2, %1, %0|%0, 
%1, %2}"
+  : "{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,apx_ndd")
@@ -17135,6 +17135,19 @@
(set_attr "mode" "")])
 
 ;; Convert rotate to the rotatex pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (rotate:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+ (match_operand:QI 2 "const_int_operand")))]
+  "TARGET_BMI2 && reload_completed && !optimize_function_for_size_p (cfun)"
+  [(set (match_dup 0)
+   (rotatert:SWI48 (match_dup 1) (match_dup 2)))] {
+  int bitsize = GET_MODE_BITSIZE (mode);
+
+  operands[2] = GEN_INT ((bitsize - INTVAL (operands[2])) % bitsize);
+})
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(rotate:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") @@ 
-17236,22 +17249,22 @@
   [(set (match_dup 0)
(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2])
 
-(define_insn "*3_1"
+(define_insn "*3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m,r")
(any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" 
"c,c")))]
+  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   if (operands[2] == const1_rtx
   && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
-  && !use_ndd)
+  && !use_ndd && !)
 return "{}\t%0";
   else
 return use_ndd
-  ? "{}\t{%2, %1, %0|%0, %1, %2}"
-  : "{}\t{%2, %0|%0, %2}";
+  ? "{}\t{%2, %1, %0|%0, %1, %2}"
+  : "{}\t{%2, %0|%0, %2}";
 }
   [(set_attr "isa" "*,apx_ndd")
(set_attr "type" "rotate")
diff --git a/gcc/testsuite/gcc.target/i386/apx-nf.c 
b/gcc/testsuite/gcc.target/i386/apx-nf.c
index 608dbf8f5f7..6e59803be64 100644
--- a/gcc/testsuite/gcc.target/i386/apx-nf.c
+++ b/gcc/testsuite/gcc.target/i386/apx-nf.c
@@ -3,6 +3,7 @@
 /* { dg-final { scan-assembler-times "\{nf\} add" 4 } } */
 /* { dg-final { scan-assembler-times "\{nf\} and" 1 } } */
 /* { dg-final { scan-assembler-times "\{nf\} or" 1 } } */
+/* { dg-final { scan-assembler-times "\{nf\} rol" 4 } } */
 
 

[PATCH v2 4/8] [APX NF] Support APX NF for right shift insns

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashr<mode>3_1_nf): New.
(*lshr<mode>3_1_nf): Ditto.
(*lshrqi3_1_nf): Ditto.
(*lshrhi3_1_nf): Ditto.
---
 gcc/config/i386/i386.md | 82 +++--
 1 file changed, 46 insertions(+), 36 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
271d449d7c4..7f191749342 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16308,13 +16308,13 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
-(define_insn "*ashr3_1"
+(define_insn "*ashr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(ashiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,r,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,r,c")))]
+  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -16325,11 +16325,11 @@
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "sar{}\t%0";
   else
-   return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sar{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, 
%2}"
+  : "sar{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,apx_ndd")
@@ -16369,14 +16369,13 @@
 }
 [(set_attr "isa" "*,*,*,apx_ndd")])
 
-
-(define_insn "*lshr3_1"
+(define_insn "*lshr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k,r")
(lshiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,k,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,r,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (LSHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,r,,c")))]
+  "ix86_binary_operator_ok (LSHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -16388,11 +16387,11 @@
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "shr{}\t%0";
   else
-   return use_ndd ? "shr{}\t{%2, %1, %0|%0, %1, %2}"
-  : "shr{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "shr{}\t{%2, %1, %0|%0, %1, 
%2}"
+  : "shr{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,avx512bw,apx_ndd") @@ -16408,6 +16407,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; The NF/NDD forms do not support a shift count in r, only in c,
+;; but they do not set the flags.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+  (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (any_shiftrt:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") @@ 
-16476,22 +16486,22 @@
(zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
-(define_insn "*ashr3_1"
+(define_insn "*ashr3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m, r")
(ashiftrt:SWI12
  (match_operand:SWI12 1 "nonimmediate_operand" "0, rm")
- (match_operand:QI 2 "nonmemory_operand" "c, c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c, c")))]
+  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   if (operands[2] == const1_rtx
   && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
-  && !use_ndd)
+  && !use_ndd && !)
 return "sar{}\t%0";
   else
-return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sar{}\t{%2, %0|%0, %2}";
+return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
+  : "sar{}\t{%2, %0|%0, %2}";
 }
   [(set_attr "isa" "*, apx_ndd")
(set_attr "type" "ishift")
@@ -16504,13 +16514,13 @@
(const_string "*")))
(set_attr "mode" "")])
 
-(define_insn "*lshrqi3_1"
+(define_insn "*lshrqi3_1"
   [(set (match_operand:QI 0 "nonimmediate_operand"  "=qm,?k,r")
(lshiftrt:QI
  

[PATCH v2 3/8] [APX NF] Support APX NF for left shift insns

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashl3_1_nf): New.
(*ashlhi3_1_nf): Ditto.
(*ashlqi3_1_nf): Ditto.
* config/i386/sse.md: New define_split.
---
 gcc/config/i386/i386.md | 80 +++--
 gcc/config/i386/sse.md  | 13 +++
 2 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
099d7f35c8f..271d449d7c4 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -15012,12 +15012,12 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
-(define_insn "*ashl3_1"
+(define_insn "*ashl3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,?k,r")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" 
"0,l,rm,k,rm")
- (match_operand:QI 2 "nonmemory_operand" 
"c,M,r,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" 
"c,M,r,,c")))]
+  "ix86_binary_operator_ok (ASHIFT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -15030,7 +15030,7 @@
 case TYPE_ALU:
   gcc_assert (operands[2] == const1_rtx);
   gcc_assert (rtx_equal_p (operands[0], operands[1]));
-  return "add{}\t%0, %0";
+  return "add{}\t%0, %0";
 
 default:
   if (operands[2] == const1_rtx
@@ -15038,11 +15038,11 @@
  /* For NDD form instructions related to TARGET_SHIFT1, the $1
 immediate do not need to be omitted as assembler will map it
 to use shorter encoding. */
- && !use_ndd)
+ && !use_ndd && !)
return "sal{}\t%0";
   else
-   return use_ndd ? "sal{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sal{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sal{}\t{%2, %1, %0|%0, %1, 
%2}"
+  : "sal{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,*,bmi2,avx512bw,apx_ndd") @@ -15073,6 +15073,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; The NF/NDD forms do not support a shift count in r, only in c,
+;; but they do not set the flags.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+ (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (ashift:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") @@ 
-15159,12 +15170,12 @@
(zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
-(define_insn "*ashlhi3_1"
+(define_insn "*ashlhi3_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp,?k,r")
(ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l,k,rm")
-  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww,cI")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFT, HImode, operands, TARGET_APX_NDD)"
+  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww,cI")))]
+  "ix86_binary_operator_ok (ASHIFT, HImode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -15175,16 +15186,16 @@
 
 case TYPE_ALU:
   gcc_assert (operands[2] == const1_rtx);
-  return "add{w}\t%0, %0";
+  return "add{w}\t%0, %0";
 
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "sal{w}\t%0";
   else
-   return use_ndd ? "sal{w}\t{%2, %1, %0|%0, %1, %2}"
-  : "sal{w}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sal{w}\t{%2, %1, %0|%0, %1, %2}"
+  : "sal{w}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,*,avx512f,apx_ndd") @@ -15212,12 +15223,12 @@
(const_string "*")))
(set_attr "mode" "HI,SI,HI,HI")])
 
-(define_insn "*ashlqi3_1"
+(define_insn "*ashlqi3_1"
   [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,Yp,?k,r")
(ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,l,k,rm")
-  (match_operand:QI 2 "nonmemory_operand" "cI,cI,M,Wb,cI")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFT, QImode, operands, TARGET_APX_NDD)"
+  (match_operand:QI 2 "nonmemory_operand" "cI,cI,M,Wb,cI")))]
+  "ix86_binary_operator_ok (ASHIFT, QImode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -15229,14 +15240,14 @@
 case TYPE_ALU:
   

[PATCH v2 2/8] [APX NF] Support APX NF for {sub/and/or/xor/neg}

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

   * config/i386/i386.md (nf_and_applied): New subst_attr.
   (nf_x64_and_applied): Ditto.
   (*sub_1_nf): New define_insn.
   (*anddi_1_nf): Ditto.
   (*and_1_nf): Ditto.
   (*qi_1_nf): Ditto.
   (*_1_nf): Ditto.
   (*neg_1_nf): Ditto.
   * config/i386/sse.md : New define_split.

gcc/testsuite/ChangeLog:

   * gcc.target/i386/apx-nf.c: Add test.
---
gcc/config/i386/i386.md| 174 +
gcc/config/i386/sse.md |  11 ++
gcc/testsuite/gcc.target/i386/apx-nf.c |   9 ++
3 files changed, 112 insertions(+), 82 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index bae344518bd..099d7f35c8f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -575,7 +575,7 @@
 
noavx512dq,fma_or_avx512vl,avx512vl,noavx512vl,avxvnni,
 
avx512vnnivl,avx512fp16,avxifma,avx512ifmavl,avxneconvert,
 
avx512bf16vl,vpclmulqdqvl,avx_noavx512f,avx_noavx512vl,
- vaes_avx512vl"
+vaes_avx512vl,noapx_nf"
   (const_string "base"))
 ;; The (bounding maximum) length of an instruction immediate.
@@ -981,6 +981,7 @@
 (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX")
   (eq_attr "mmx_isa" "avx")
 (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX")
+ (eq_attr "isa" "noapx_nf") (symbol_ref "!TARGET_APX_NF")
  ]
  (const_int 1)))
@@ -7893,20 +7894,21 @@
   "split_double_mode (mode, [0], 2, [0], [3]);"
[(set_attr "isa" "*,*,apx_ndd,apx_ndd")])
-(define_insn "*sub_1"
-  [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,,r,r,r")
+(define_insn "*sub_1"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" 
"=m,r,,r,r,r")
  (minus:SWI
-(match_operand:SWI 1 "nonimmediate_operand" "0,0,rm,rjM,r")
-(match_operand:SWI 2 "" 
",,r,,")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)"
+   (match_operand:SWI 1 "nonimmediate_operand" "0,0,0,rm,rjM,r")
+   (match_operand:SWI 2 "" 
",,,r,,")))]
+  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)
+  && "
   "@
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd")
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd,apx_ndd")
(set_attr "type" "alu")
(set_attr "mode" "")])
@@ -11795,27 +11797,31 @@
}
[(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd_64,apx_ndd")])
-(define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm,r,r,r,r,r,?k")
+(define_subst_attr "nf_and_applied" "nf_subst"  "noapx_nf" "*")
+(define_subst_attr "nf_x64_and_applied" "nf_subst" "noapx_nf" "x64")
+
+(define_insn "*anddi_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand" 
"=r,r,rm,r,r,r,r,r,r,?k")
  (and:DI
-  (match_operand:DI 1 "nonimmediate_operand" 
"%0,r,0,0,rm,rjM,r,qm,k")
-  (match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,re,m,r,e,m,L,k")))
-   (clobber (reg:CC FLAGS_REG))]
+ (match_operand:DI 1 "nonimmediate_operand" 
"%0,r,0,0,0,rm,rjM,r,qm,k")
+ (match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,r,e,m,r,e,m,L,k")))]
   "TARGET_64BIT
-   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)"
+   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)
+   && "
   "@
-   and{l}\t{%k2, %k0|%k0, %k2}
-   and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
-   and{q}\t{%2, %0|%0, %2}
-   and{q}\t{%2, %0|%0, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{l}\t{%k2, %k0|%k0, %k2}
+   and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
#
#"
-  [(set_attr "isa" "x64,apx_ndd,x64,x64,apx_ndd,apx_ndd,apx_ndd,x64,avx512bw")
-   (set_attr "type" "alu,alu,alu,alu,alu,alu,alu,imovx,msklog")
-   (set_attr "length_immediate" "*,*,*,*,*,*,*,0,*")
+  [(set_attr "isa" 
"x64,apx_ndd,x64,x64,x64,apx_ndd,apx_ndd,apx_ndd,,avx512bw")
+   (set_attr "type" "alu,alu,alu,alu,alu,alu,alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,*,*,*,*,*,*,0,*")
(set (attr "prefix_rex")
  (if_then_else
(and (eq_attr "type" "imovx")
@@ -11823,7 +11829,7 @@
 

[PATCH v2 1/8] [APX NF]: Support APX NF add

2024-05-22 Thread Kong, Lingling
> I wonder if we can use "define_subst" to conditionally add flags clobber
> for !TARGET_APX_NF targets. Even the example for "Define Subst" uses the insn
> w/ and w/o the clobber, so I think it is worth considering this approach.
> 
> Uros.

Good suggestion. I defined a new subst for no flags, and bootstrapped and
regtested on x86_64-linux-gnu. SPEC 2017 also runs normally on the Intel
Software Development Emulator.
Ok for trunk?

Thanks,
Lingling

Subject: [PATCH v2 1/8] [APX NF]: Support APX NF add
The APX NF (no flags) feature suppresses the update of status flags
for arithmetic operations.

For NF add, it is not clear whether NF add can be faster than lea. If so,
the pattern needs to be adjusted to prefer lea generation.

gcc/ChangeLog:

* config/i386/i386-opts.h (enum apx_features): Add nf
enumeration.
* config/i386/i386.h (TARGET_APX_NF): New.
* config/i386/i386.md (nf_subst): New define_subst.
(nf_name): New subst_attr.
(nf_prefix): Ditto.
(nf_condition): Ditto.
(nf_mem_constraint): Ditto.
(nf_applied): Ditto.
(*add_1_nf): New define_insn.
(addhi_1_nf): Ditto.
(addqi_1_nf): Ditto.
* config/i386/i386.opt: Add apx_nf enumeration.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd.c: Fixed test.
* gcc.target/i386/apx-nf.c: New test.

Co-authored-by: Lingling Kong 
---
 gcc/config/i386/i386-opts.h |   3 +-
 gcc/config/i386/i386.h  |   1 +
 gcc/config/i386/i386.md | 179 +++-
 gcc/config/i386/i386.opt|   3 +
 gcc/testsuite/gcc.target/i386/apx-ndd.c |   2 +-
 gcc/testsuite/gcc.target/i386/apx-nf.c  |   6 +
 6 files changed, 126 insertions(+), 68 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-nf.c

diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index ef2825803b3..60176ce609f 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -140,7 +140,8 @@ enum apx_features {
   apx_push2pop2 = 1 << 1,
   apx_ndd = 1 << 2,
   apx_ppx = 1 << 3,
-  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx,
+  apx_nf = 1<< 4,
+  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx | apx_nf,
 };
 
 #endif
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 529edff93a4..f20ae4726da 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -55,6 +55,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 #define TARGET_APX_PUSH2POP2 (ix86_apx_features & apx_push2pop2)
 #define TARGET_APX_NDD (ix86_apx_features & apx_ndd)
 #define TARGET_APX_PPX (ix86_apx_features & apx_ppx)
+#define TARGET_APX_NF (ix86_apx_features & apx_nf)
 
 #include "config/vxworks-dummy.h"
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 764bfe20ff2..bae344518bd 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6233,28 +6233,6 @@
 }
 })
 

-;; Load effective address instructions
-
-(define_insn "*lea"
-  [(set (match_operand:SWI48 0 "register_operand" "=r")
-   (match_operand:SWI48 1 "address_no_seg_operand" "Ts"))]
-  "ix86_hardreg_mov_ok (operands[0], operands[1])"
-{
-  if (SImode_address_operand (operands[1], VOIDmode))
-{
-  gcc_assert (TARGET_64BIT);
-  return "lea{l}\t{%E1, %k0|%k0, %E1}";
-}
-  else
-return "lea{}\t{%E1, %0|%0, %E1}";
-}
-  [(set_attr "type" "lea")
-   (set (attr "mode")
- (if_then_else
-   (match_operand 1 "SImode_address_operand")
-   (const_string "SI")
-   (const_string "")))])
-
 (define_peephole2
   [(set (match_operand:SWI48 0 "register_operand")
(match_operand:SWI48 1 "address_no_seg_operand"))]
@@ -6290,6 +6268,13 @@
   [(parallel [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))
   (clobber (reg:CC FLAGS_REG))])]
   "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
+
+(define_split
+  [(set (match_operand:SWI48 0 "general_reg_operand")
+   (mult:SWI48 (match_dup 0) (match_operand:SWI48 1 "const1248_operand")))]
+  "TARGET_APX_NF && reload_completed"
+  [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))]
+  "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
 

 ;; Add instructions
 
@@ -6437,48 +6422,65 @@
  (clobber (reg:CC FLAGS_REG))])]
  "split_double_mode (mode, [0], 1, [0], [5]);")
 
-(define_insn "*add_1"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r,r,r,r")
+(define_subst_attr "nf_name" "nf_subst" "_nf" "")
+(define_subst_attr "nf_prefix" "nf_subst" "%{nf%} " "")
+(define_subst_attr "nf_condition" "nf_subst" "TARGET_APX_NF" "true")
+(define_subst_attr "nf_mem_constraint" "nf_subst" "je" "m")
+(define_subst_attr "nf_applied" "nf_subst" "true" "false")
+
+(define_subst "nf_subst"
+  [(set (match_operand:SWI 0)
+(match_operand:SWI 1))]
+  ""
+  [(set (match_dup 0)
+   (match_dup 1))
+   

RE: [PATCH 1/8] [APX NF]: Support APX NF add

2024-05-15 Thread Kong, Lingling
> -Original Message-
> From: Uros Bizjak 
> Sent: Wednesday, May 15, 2024 4:15 PM
> To: Kong, Lingling 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; Wang,
> Hongyu 
> Subject: Re: [PATCH 1/8] [APX NF]: Support APX NF add
> 
> On Wed, May 15, 2024 at 9:43 AM Kong, Lingling 
> wrote:
> >
> > From: Hongyu Wang 
> >
> > APX NF(no flags) feature implements suppresses the update of status flags
> > for arithmetic operations.
> >
> > For NF add, it is not clear whether NF add can be faster than lea. If so,
> > the pattern needs to be adjusted to prefer LEA generation.
> 
> > diff --git a/gcc/testsuite/gcc.target/i386/apx-ndd.c
> > b/gcc/testsuite/gcc.target/i386/apx-ndd.c
> > index 0eb751ad225..0ff4df0780c 100644
> > --- a/gcc/testsuite/gcc.target/i386/apx-ndd.c
> > +++ b/gcc/testsuite/gcc.target/i386/apx-ndd.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile { target { ! ia32 } } } */
> > -/* { dg-options "-mapxf -march=x86-64 -O2" } */
> > +/* { dg-options "-mapx-features=egpr,push2pop2,ndd,ppx -march=x86-64
> > +-O2" } */
> 
> Please do not split options to a separate line; here and in other places.
> 
> Uros.

Sorry, my send-email setup mangled some of the formatting, so I have added the patches as attachments.

Thanks, 
Lingling



0004-APX-NF-Support-APX-NF-for-right-shift-insns.patch
Description: 0004-APX-NF-Support-APX-NF-for-right-shift-insns.patch


0005-APX-NF-Support-APX-NF-for-rotate-insns.patch
Description: 0005-APX-NF-Support-APX-NF-for-rotate-insns.patch


0006-APX-NF-Support-APX-NF-for-shld-shrd.patch
Description: 0006-APX-NF-Support-APX-NF-for-shld-shrd.patch


0007-APX-NF-Support-APX-NF-for-mul-div.patch
Description: 0007-APX-NF-Support-APX-NF-for-mul-div.patch


0008-APX-NF-Support-APX-NF-for-lzcnt-tzcnt-popcnt.patch
Description: 0008-APX-NF-Support-APX-NF-for-lzcnt-tzcnt-popcnt.patch


0001-APX-NF-Support-APX-NF-add.patch
Description: 0001-APX-NF-Support-APX-NF-add.patch


0002-APX-NF-Support-APX-NF-for-sub-and-or-xor-neg.patch
Description: 0002-APX-NF-Support-APX-NF-for-sub-and-or-xor-neg.patch


0003-APX-NF-Support-APX-NF-for-left-shift-insns.patch
Description: 0003-APX-NF-Support-APX-NF-for-left-shift-insns.patch


[PATCH 8/8] [APX NF] Support APX NF for lzcnt/tzcnt/popcnt

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (clz2_lzcnt_nf): New define_insn.
(*clz2_lzcnt_falsedep_nf): Ditto.
(__nf): Ditto.
(*__falsedep_nf): Ditto.
(_hi_nf): Ditto.
(popcount2_nf): Ditto.
(*popcount2_falsedep_nf): Ditto.
(popcounthi2_nf): Ditto.
---
 gcc/config/i386/i386.md | 132 
 1 file changed, 132 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
55f65a31b16..ddde83e57f5 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -21029,6 +21029,24 @@
   operands[3] = gen_reg_rtx (mode);
 })
 
+(define_insn_and_split "clz2_lzcnt_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (clz:SWI48 (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "clz2_lzcnt"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -21052,6 +21070,18 @@
 ; False dependency happens when destination is only updated by tzcnt,
 ; lzcnt or popcnt.  There is no false dependency when destination is
 ; also used in source.
+(define_insn "*clz2_lzcnt_falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn "*clz2_lzcnt_falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -21158,6 +21188,25 @@
 ;; Version of lzcnt/tzcnt that is expanded from intrinsics.  This version
 ;; provides operand size as output when source operand is zero.
 
+(define_insn_and_split "__nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (unspec:SWI48 [(match_dup 1)] LT_ZCNT))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "_"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -21182,6 +21231,20 @@
 ; False dependency happens when destination is only updated by tzcnt,
 ; lzcnt or popcnt.  There is no false dependency when destination is
 ; also used in source.
+; also used in source.
+(define_insn "*__falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn "*__falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -21196,6 +21259,17 @@
(set_attr "prefix_rep" "1")
(set_attr "mode" "")])
 
+(define_insn "_hi_nf"
+  [(set (match_operand:HI 0 "register_operand" "=r")
+   (unspec:HI
+ [(match_operand:HI 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  "TARGET_APX_NF"
+  "%{nf%} {w}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "HI")])
+
 (define_insn "_hi"
   [(set (match_operand:HI 0 "register_operand" "=r")
(unspec:HI
@@ -21620,6 +21694,30 @@
   [(set_attr "type" "bitmanip")
(set_attr "mode" "")])
 
+(define_insn_and_split "popcount2_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (popcount:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_POPCNT"
+{
+#if TARGET_MACHO
+  return "%{nf%} popcnt\t{%1, %0|%0, %1}"; #else
+  return "%{nf%} popcnt{}\t{%1, %0|%0, %1}"; #endif }
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && 

[PATCH 6/8] [APX NF] Support APX NF for shld/shrd

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (x86_64_shld_nf): New define_insn.
(x86_64_shld_ndd_nf): Ditto.
(x86_64_shld_1_nf): Ditto.
(x86_64_shld_ndd_1_nf): Ditto.
(*x86_64_shld_shrd_1_nozext_nf): Ditto.
(x86_shld_nf): Ditto.
(x86_shld_ndd_nf): Ditto.
(x86_shld_1_nf): Ditto.
(x86_shld_ndd_1_nf): Ditto.
(*x86_shld_shrd_1_nozext_nf): Ditto.
(3_doubleword_lowpart_nf): Ditto.
(x86_64_shrd_nf): Ditto.
(x86_64_shrd_ndd_nf): Ditto.
(x86_64_shrd_1_nf): Ditto.
(x86_64_shrd_ndd_1_nf): Ditto.
(*x86_64_shrd_shld_1_nozext_nf): Ditto.
(x86_shrd_nf): Ditto.
(x86_shrd_ndd_nf): Ditto.
(x86_shrd_1_nf): Ditto.
(x86_shrd_ndd_1_nf): Ditto.
(*x86_shrd_shld_1_nozext_nf): Ditto.
---
 gcc/config/i386/i386.md | 518 
 1 file changed, 518 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
ff44154b26b..f9a62fba0c4 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14666,6 +14666,26 @@
   DONE;
 })
 
+(define_insn "x86_64_shld_nf"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
+(ior:DI (ashift:DI (match_dup 0)
+ (and:QI (match_operand:QI 2 "nonmemory_operand" "Jc")
+ (const_int 63)))
+   (subreg:DI
+ (lshiftrt:TI
+   (zero_extend:TI
+ (match_operand:DI 1 "register_operand" "r"))
+   (minus:QI (const_int 64)
+ (and:QI (match_dup 2) (const_int 63 0)))]
+  "TARGET_APX_NF"
+  "%{nf%} shld{q}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "DI")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "vector")
+   (set_attr "bdver1_decode" "vector")])
+
 (define_insn "x86_64_shld"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0) @@ -14687,6 +14707,22 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
 
+(define_insn "x86_64_shld_ndd_nf"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+(ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
+ (and:QI (match_operand:QI 3 "nonmemory_operand" "Jc")
+ (const_int 63)))
+   (subreg:DI
+ (lshiftrt:TI
+   (zero_extend:TI
+ (match_operand:DI 2 "register_operand" "r"))
+   (minus:QI (const_int 64)
+ (and:QI (match_dup 3) (const_int 63 0)))]
+  "TARGET_APX_NDD && TARGET_APX_NF"
+  "%{nf%} shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "DI")])
+
 (define_insn "x86_64_shld_ndd"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm") @@ 
-14704,6 +14740,43 @@
   [(set_attr "type" "ishift")
(set_attr "mode" "DI")])
 
+(define_insn "x86_64_shld_1_nf"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
+(ior:DI (ashift:DI (match_dup 0)
+  (match_operand:QI 2 "const_0_to_63_operand"))
+   (subreg:DI
+ (lshiftrt:TI
+   (zero_extend:TI
+ (match_operand:DI 1 "register_operand" "r"))
+   (match_operand:QI 3 "const_0_to_255_operand")) 0)))]
+  "TARGET_64BIT
+   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
+   && TARGET_APX_NF"
+  "%{nf%} shld{q}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "DI")
+   (set_attr "length_immediate" "1")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "vector")
+   (set_attr "bdver1_decode" "vector")])
+
+(define_insn "x86_64_shld_ndd_1_nf"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+(ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
+  (match_operand:QI 3 "const_0_to_63_operand"))
+   (subreg:DI
+ (lshiftrt:TI
+   (zero_extend:TI
+ (match_operand:DI 2 "register_operand" "r"))
+   (match_operand:QI 4 "const_0_to_255_operand")) 0)))]
+  "TARGET_APX_NDD && TARGET_APX_NF
+   && INTVAL (operands[4]) == 64 - INTVAL (operands[3])"
+  "%{nf%} shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "DI")
+   (set_attr "length_immediate" "1")])
+
 (define_insn "x86_64_shld_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0) @@ -14742,6 +14815,58 @@
(set_attr "mode" "DI")
(set_attr "length_immediate" "1")])
 
+(define_insn_and_split "*x86_64_shld_shrd_1_nozext_nf"

[PATCH 7/8] [APX NF] Support APX NF for mul/div

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*mul3_1_nf): New define_insn.
(*mulqi3_1_nf): Ditto.
(*divmod4_noext_nf): Ditto.
(divmodhiqi3_nf): Ditto.
---
 gcc/config/i386/i386.md | 86 +
 1 file changed, 86 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
f9a62fba0c4..55f65a31b16 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9907,6 +9907,42 @@
 ;;
 ;; On BDVER1, all HI MULs use DoublePath
 
+(define_insn "*mul3_1_nf"
+  [(set (match_operand:SWIM248 0 "register_operand" "=r,r,r")
+   (mult:SWIM248
+ (match_operand:SWIM248 1 "nonimmediate_operand" "%rm,rm,0")
+ (match_operand:SWIM248 2 "" "K,,r")))]
+  "TARGET_APX_NF &&
+  !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   %{nf%} imul{}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} imul{}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} imul{}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "imul")
+   (set_attr "prefix_0f" "0,0,1")
+   (set (attr "athlon_decode")
+   (cond [(eq_attr "cpu" "athlon")
+ (const_string "vector")
+  (eq_attr "alternative" "1")
+ (const_string "vector")
+  (and (eq_attr "alternative" "2")
+   (ior (match_test "mode == HImode")
+(match_operand 1 "memory_operand")))
+ (const_string "vector")]
+ (const_string "direct")))
+   (set (attr "amdfam10_decode")
+   (cond [(and (eq_attr "alternative" "0,1")
+   (ior (match_test "mode == HImode")
+(match_operand 1 "memory_operand")))
+ (const_string "vector")]
+ (const_string "direct")))
+   (set (attr "bdver1_decode")
+   (if_then_else
+ (match_test "mode == HImode")
+   (const_string "double")
+   (const_string "direct")))
+   (set_attr "mode" "")])
+
 (define_insn "*mul3_1"
   [(set (match_operand:SWIM248 0 "register_operand" "=r,r,r")
(mult:SWIM248
@@ -9978,6 +10014,24 @@
 ;; MUL reg8Direct
 ;; MUL mem8Direct
 
+(define_insn "*mulqi3_1_nf"
+  [(set (match_operand:QI 0 "register_operand" "=a")
+   (mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0")
+(match_operand:QI 2 "nonimmediate_operand" "qm")))]
+  "TARGET_APX_NF &&
+  TARGET_QIMODE_MATH
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "%{nf%} mul{b}\t%2"
+  [(set_attr "type" "imul")
+   (set_attr "length_immediate" "0")
+   (set (attr "athlon_decode")
+ (if_then_else (eq_attr "cpu" "athlon")
+(const_string "vector")
+(const_string "direct")))
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "QI")])
+
 (define_insn "*mulqi3_1"
   [(set (match_operand:QI 0 "register_operand" "=a")
(mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0") @@ -11128,6 
+11182,19 @@
   [(set_attr "type" "multi")
(set_attr "mode" "SI")])
 
+(define_insn "*divmod4_noext_nf"
+  [(set (match_operand:SWIM248 0 "register_operand" "=a")
+   (any_div:SWIM248
+ (match_operand:SWIM248 2 "register_operand" "0")
+ (match_operand:SWIM248 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWIM248 1 "register_operand" "=d")
+   (:SWIM248 (match_dup 2) (match_dup 3)))
+   (use (match_operand:SWIM248 4 "register_operand" "1"))]
+  "TARGET_APX_NF"
+  "%{nf%} div{}\t%3"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "")])
+
 (define_insn "*divmod4_noext"
   [(set (match_operand:SWIM248 0 "register_operand" "=a")
(any_div:SWIM248
@@ -11275,6 +11342,25 @@
 ;; Change div/mod to HImode and extend the second argument to HImode
 ;; so that mode of div/mod matches with mode of arguments.  Otherwise
 ;; combine may fail.
+(define_insn "divmodhiqi3_nf"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+   (ior:HI
+ (ashift:HI
+   (zero_extend:HI
+ (truncate:QI
+   (mod:HI (match_operand:HI 1 "register_operand" "0")
+   (any_extend:HI
+ (match_operand:QI 2 "nonimmediate_operand" "qm")
+   (const_int 8))
+ (zero_extend:HI
+   (truncate:QI
+ (div:HI (match_dup 1) (any_extend:HI (match_dup 2)))]
+  "TARGET_APX_NF
+  && TARGET_QIMODE_MATH"
+  "%{nf%} div{b}\t%2"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "QI")])
+
 (define_insn "divmodhiqi3"
   [(set (match_operand:HI 0 "register_operand" "=a")
(ior:HI
--
2.31.1



[PATCH 5/8] [APX NF] Support APX NF for rotate insns

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (ashr3_cvt_nf): New define_insn.
(*3_1_nf): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-nf.c: Add NF test for rotate insns.
---
 gcc/config/i386/i386.md| 80 ++
 gcc/testsuite/gcc.target/i386/apx-nf.c |  5 ++
 2 files changed, 85 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
adcb09fcdd0..ff44154b26b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16491,6 +16491,25 @@
 (define_mode_attr cvt_mnemonic
   [(SI "{cltd|cdq}") (DI "{cqto|cqo}")])
 
+(define_insn "ashr3_cvt_nf"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
+   (ashiftrt:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "0,rm")
+ (match_operand:QI 2 "const_int_operand")))]
+  "TARGET_APX_NF &&
+   INTVAL (operands[2]) == GET_MODE_BITSIZE (mode)-1
+   && (TARGET_USE_CLTD || optimize_function_for_size_p (cfun))
+   && ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} sar{}\t{%2, %0|%0, %2}
+   %{nf%} sar{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,apx_ndd")
+   (set_attr "type" "ishift")
+   (set_attr "prefix_0f" "*")
+   (set_attr "length_immediate" "*")
+   (set_attr "modrm" "1")
+   (set_attr "mode" "")])
+
 (define_insn "ashr3_cvt"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=*d,rm,r")
(ashiftrt:SWI48
@@ -17430,6 +17449,39 @@
   [(set_attr "type" "rotatex")
(set_attr "mode" "")])
 
+(define_insn "*3_1_nf"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
+   (any_rotate:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "0,rm")
+ (match_operand:QI 2 "nonmemory_operand" "c,c")))]
+  "TARGET_APX_NF &&
+  ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+{
+  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
+  if (operands[2] == const1_rtx
+  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
+  && !use_ndd)
+return "%{nf%} {}\t%0";
+  else
+return use_ndd ? "%{nf%} {}\t{%2, %1, %0|%0, %1, %2}"
+  : "%{nf%} {}\t{%2, %0|%0, %2}"; }
+  [(set_attr "isa" "*,apx_ndd")
+   (set_attr "type" "rotate")
+   (set (attr "preferred_for_size")
+ (cond [(eq_attr "alternative" "0")
+ (symbol_ref "true")]
+  (symbol_ref "false")))
+   (set (attr "length_immediate")
+ (if_then_else
+   (and (eq_attr "type" "rotate")
+   (and (match_operand 2 "const1_operand")
+(ior (match_test "TARGET_SHIFT1")
+ (match_test "optimize_function_for_size_p (cfun)"
+   (const_string "0")
+   (const_string "*")))
+   (set_attr "mode" "")])
+
 (define_insn "*3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(any_rotate:SWI48
@@ -17572,6 +17624,34 @@
   [(set (match_dup 0)
(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2])
 
+(define_insn "*3_1_nf"
+  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m,r")
+   (any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0,rm")
+ (match_operand:QI 2 "nonmemory_operand" 
"c,c")))]
+  "TARGET_APX_NF &&
+  ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+{
+  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
+  if (operands[2] == const1_rtx
+  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
+  && !use_ndd)
+return "%{nf%} {}\t%0";
+  else
+return use_ndd
+  ? "%{nf%} {}\t{%2, %1, %0|%0, %1, %2}"
+  : "%{nf%} {}\t{%2, %0|%0, %2}"; }
+  [(set_attr "isa" "*,apx_ndd")
+   (set_attr "type" "rotate")
+   (set (attr "length_immediate")
+ (if_then_else
+   (and (match_operand 2 "const1_operand")
+   (ior (match_test "TARGET_SHIFT1")
+(match_test "optimize_function_for_size_p (cfun)")))
+   (const_string "0")
+   (const_string "*")))
+   (set_attr "mode" "")])
+
 (define_insn "*3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m,r")
(any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0,rm") 
diff --git a/gcc/testsuite/gcc.target/i386/apx-nf.c 
b/gcc/testsuite/gcc.target/i386/apx-nf.c
index 608dbf8f5f7..6e59803be64 100644
--- a/gcc/testsuite/gcc.target/i386/apx-nf.c
+++ b/gcc/testsuite/gcc.target/i386/apx-nf.c
@@ -3,6 +3,7 @@
 /* { dg-final { scan-assembler-times "\{nf\} add" 4 } } */
 /* { dg-final { scan-assembler-times "\{nf\} and" 1 } } */
 /* { dg-final { scan-assembler-times "\{nf\} or" 1 } } */
+/* { dg-final { scan-assembler-times "\{nf\} rol" 4 } } */
 
 #include "apx-ndd.c"
 
@@ -13,3 +14,7 @@ foo (struct B *b)
 {
 b->bit0 = b->bit0 | b->bit1;
 }
+long int f1 (int x) { return ~(1ULL << (x & 0x3f)); } long int f2 (int 
+x) { return ~(1ULL << x); } long int f3 (unsigned char *x) { return 
+~(1ULL << (x[0] & 0x3f)); } long int f4 (unsigned char *x) { return 
+~(1ULL << x[0]); }
--

[PATCH 4/8] [APX NF] Support APX NF for right shift insns

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashr3_1_nf): New.
(*lshr3_1_nf): Ditto.
(*lshrqi3_1_nf): Ditto.
(*lshrhi3_1_nf): Ditto.
---
 gcc/config/i386/i386.md | 85 +
 1 file changed, 85 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
9ffdb3fe71a..adcb09fcdd0 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16569,6 +16569,21 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
+(define_insn "*ashr3_1_nf"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
+   (ashiftrt:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,rm")
+ (match_operand:QI 2 "nonmemory_operand" "c,r,c")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} sar{}\t{%2, %0|%0, %2}
+   #
+   %{nf%} sar{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,bmi2,apx_ndd")
+   (set_attr "type" "ishift,ishiftx,ishift")
+   (set_attr "mode" "")])
+
 (define_insn "*ashr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(ashiftrt:SWI48
@@ -16630,6 +16645,21 @@
 }
 [(set_attr "isa" "*,*,*,apx_ndd")])
 
+(define_insn "*lshr3_1_nf"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k,r")
+   (lshiftrt:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,k,rm")
+ (match_operand:QI 2 "nonmemory_operand" "c,r,,c")))]
+  "TARGET_APX_NF &&
+  ix86_binary_operator_ok (LSHIFTRT, mode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} shr{}\t{%2, %0|%0, %2}
+   #
+   #
+   %{nf%} shr{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,bmi2,avx512bw,apx_ndd")
+   (set_attr "type" "ishift,ishiftx,msklog,ishift")
+   (set_attr "mode" "")])
 
 (define_insn "*lshr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k,r") @@ 
-16669,6 +16699,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; For NF/NDD doesn't support shift count as r, it just support c, 
+;; but it has no flag.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+  (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (any_shiftrt:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") @@ 
-16737,6 +16778,20 @@
(zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
+(define_insn "*ashr3_1_nf"
+  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m, r")
+   (ashiftrt:SWI12
+ (match_operand:SWI12 1 "nonimmediate_operand" "0, rm")
+ (match_operand:QI 2 "nonmemory_operand" "c, c")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} sar{}\t{%2, %0|%0, %2}
+   %{nf%} sar{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*, apx_ndd")
+   (set_attr "type" "ishift")
+   (set_attr "mode" "")])
+
 (define_insn "*ashr3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m, r")
(ashiftrt:SWI12
@@ -16765,6 +16820,21 @@
(const_string "*")))
(set_attr "mode" "")])
 
+(define_insn "*lshrqi3_1_nf"
+  [(set (match_operand:QI 0 "nonimmediate_operand"  "=qm,?k,r")
+   (lshiftrt:QI
+ (match_operand:QI 1 "nonimmediate_operand" "0,k,rm")
+ (match_operand:QI 2 "nonmemory_operand""cI,Wb,cI")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (LSHIFTRT, QImode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} shr{b}\t{%2, %0|%0, %2}
+   #
+   %{nf%} shr{b}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,avx512dq,apx_ndd")
+   (set_attr "type" "ishift,msklog,ishift")
+   (set_attr "mode" "QI")])
+
 (define_insn "*lshrqi3_1"
   [(set (match_operand:QI 0 "nonimmediate_operand"  "=qm,?k,r")
(lshiftrt:QI
@@ -16802,6 +16872,21 @@
(const_string "*")))
(set_attr "mode" "QI")])
 
+(define_insn "*lshrhi3_1_nf"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,?k,r")
+   (lshiftrt:HI
+ (match_operand:HI 1 "nonimmediate_operand" "0,k,rm")
+ (match_operand:QI 2 "nonmemory_operand" "cI,Ww,cI")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (LSHIFTRT, HImode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} shr{w}\t{%2, %0|%0, %2}
+   #
+   %{nf%} shr{w}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*, avx512f, apx_ndd")
+   (set_attr "type" "ishift,msklog,ishift")
+   (set_attr "mode" "HI")])
+
 (define_insn "*lshrhi3_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rm, ?k, r")
(lshiftrt:HI
--
2.31.1



[PATCH 3/8] [APX NF] Support APX NF for left shift insns

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashl3_1_nf): New.
(*ashlhi3_1_nf): Ditto.
(*ashlqi3_1_nf): Ditto.
* config/i386/sse.md: New define_split.
---
 gcc/config/i386/i386.md | 175 
 gcc/config/i386/sse.md  |  13 +++
 2 files changed, 188 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
66dc5e1035f..9ffdb3fe71a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -15126,6 +15126,54 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
+(define_insn "*ashl3_1_nf"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,?k,r")
+   (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" 
"0,l,rm,k,rm")
+ (match_operand:QI 2 "nonmemory_operand" 
+"c,M,r,,c")))]
+  "TARGET_APX_NF &&
+  ix86_binary_operator_ok (ASHIFT, mode, operands, TARGET_APX_NDD)"
+{
+  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
+  switch (get_attr_type (insn))
+{
+case TYPE_LEA:
+case TYPE_ISHIFTX:
+case TYPE_MSKLOG:
+  return "#";
+
+case TYPE_ALU:
+  gcc_assert (operands[2] == const1_rtx);
+  gcc_assert (rtx_equal_p (operands[0], operands[1]));
+  return "%{nf%} add{}\t%0, %0";
+
+default:
+  return use_ndd ? "%{nf%} sal{}\t{%2, %1, %0|%0, %1, %2}"
+: "%{nf%} sal{}\t{%2, %0|%0, %2}";
+}
+}
+  [(set_attr "isa" "*,*,bmi2,avx512bw,apx_ndd")
+   (set (attr "type")
+ (cond [(eq_attr "alternative" "1")
+ (const_string "lea")
+   (eq_attr "alternative" "2")
+ (const_string "ishiftx")
+   (eq_attr "alternative" "4")
+ (const_string "ishift")
+(and (and (match_test "TARGET_DOUBLE_WITH_ADD")
+ (match_operand 0 "register_operand"))
+(match_operand 2 "const1_operand"))
+ (const_string "alu")
+   (eq_attr "alternative" "3")
+ (const_string "msklog")
+  ]
+  (const_string "ishift")))
+   (set (attr "length_immediate")
+ (if_then_else
+   (eq_attr "type" "alu")
+   (const_string "0")
+   (const_string "*")))
+   (set_attr "mode" "")])
+
 (define_insn "*ashl3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,?k,r")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" 
"0,l,rm,k,rm") @@ -15187,6 +15235,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; For NF/NDD doesn't support shift count as r, it just support c, 
+;; but it has no flag.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+ (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (ashift:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") @@ 
-15273,6 +15332,50 @@
(zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
+(define_insn "*ashlhi3_1_nf"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp,?k,r")
+   (ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l,k,rm")
+  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww,cI")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (ASHIFT, HImode, operands, TARGET_APX_NDD)"
+{
+  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
+  switch (get_attr_type (insn))
+{
+case TYPE_LEA:
+case TYPE_MSKLOG:
+  return "#";
+
+case TYPE_ALU:
+  gcc_assert (operands[2] == const1_rtx);
+  return "%{nf%} add{w}\t%0, %0";
+
+default:
+  return use_ndd ? "%{nf%} sal{w}\t{%2, %1, %0|%0, %1, %2}"
+: "%{nf%} sal{w}\t{%2, %0|%0, %2}";
+}
+}
+  [(set_attr "isa" "*,*,avx512f,apx_ndd")
+   (set (attr "type")
+ (cond [(eq_attr "alternative" "1")
+ (const_string "lea")
+   (eq_attr "alternative" "2")
+ (const_string "msklog")
+   (eq_attr "alternative" "3")
+ (const_string "ishift")
+(and (and (match_test "TARGET_DOUBLE_WITH_ADD")
+ (match_operand 0 "register_operand"))
+(match_operand 2 "const1_operand"))
+ (const_string "alu")
+  ]
+  (const_string "ishift")))
+   (set (attr "length_immediate")
+ (if_then_else
+   (eq_attr "type" "alu")
+   (const_string "0")
+   (const_string "*")))
+   (set_attr "mode" "HI,SI,HI,HI")])
+
 (define_insn "*ashlhi3_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp,?k,r")
(ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l,k,rm") @@ 
-15326,6 +15429,61 @@
(const_string 

[PATCH 2/8] [APX NF] Support APX NF for {sub/and/or/xor/neg}

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*sub_1_nf): New define_insn.
(*anddi_1_nf): Ditto.
(*and_1_nf): Ditto.
(*qi_1_nf): Ditto.
(*_1_nf): Ditto.
(*neg_1_nf): Ditto.
* config/i386/sse.md : New define_split.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-nf.c: Add test.
---
 gcc/config/i386/i386.md| 129 +
 gcc/config/i386/sse.md |  11 +++
 gcc/testsuite/gcc.target/i386/apx-nf.c |   9 ++
 3 files changed, 149 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
4a9e35c4990..66dc5e1035f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -7888,6 +7888,24 @@
   "split_double_mode (mode, [0], 2, [0], [3]);"
 [(set_attr "isa" "*,*,apx_ndd,apx_ndd")])
 
+(define_insn "*sub_1_nf"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,rjM,,r,r,r")
+   (minus:SWI
+ (match_operand:SWI 1 "nonimmediate_operand" "0,0,0,rm,rjM,r")
+ (match_operand:SWI 2 "" ",,,r,,")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)"
+  "@
+  %{nf%} sub{}\t{%2, %0|%0, %2}
+  %{nf%} sub{}\t{%2, %0|%0, %2}
+  %{nf%} sub{}\t{%2, %0|%0, %2}
+  %{nf%} sub{}\t{%2, %1, %0|%0, %1, %2}
+  %{nf%} sub{}\t{%2, %1, %0|%0, %1, %2}
+  %{nf%} sub{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd,apx_ndd")
+   (set_attr "type" "alu")
+   (set_attr "mode" "")])
+
 (define_insn "*sub_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,,r,r,r")
(minus:SWI
@@ -11790,6 +11808,27 @@
 }
 [(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd_64,apx_ndd")])
 
+(define_insn "*anddi_1_nf"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm,rjM,r,r,r,r,?k")
+   (and:DI
+(match_operand:DI 1 "nonimmediate_operand" "%0,r,0,0,0,rm,rjM,r,k")
+(match_operand:DI 2 "x86_64_szext_general_operand" 
+"Z,Z,r,e,m,r,e,m,k")))]
+  "TARGET_APX_NF
+   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} and{l}\t{%k2, %k0|%k0, %k2}
+   %{nf%} and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   %{nf%} and{q}\t{%2, %0|%0, %2}
+   %{nf%} and{q}\t{%2, %0|%0, %2}
+   %{nf%} and{q}\t{%2, %0|%0, %2}
+   %{nf%} and{q}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} and{q}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} and{q}\t{%2, %1, %0|%0, %1, %2}
+   #"
+  [(set_attr "isa" "*,apx_ndd,*,*,*,apx_ndd,apx_ndd,apx_ndd,avx512bw")
+   (set_attr "type" "alu,alu,alu,alu,alu,alu,alu,alu,msklog")
+   (set_attr "mode" "SI,SI,DI,DI,DI,DI,DI,DI,DI")])
+
 (define_insn "*anddi_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm,r,r,r,r,r,?k")
(and:DI
@@ -11889,6 +11928,33 @@
(set_attr "isa" "*,apx_ndd,apx_ndd,apx_ndd")
(set_attr "mode" "SI")])
 
+(define_insn "*and_1_nf"
+  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,rjM,r,r,r,r,?k")
+   (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" 
"%0,0,0,rm,rjM,r,k")
+  (match_operand:SWI24 2 "" 
+"r,,,r,,,k")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (AND, mode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} and{}\t{%2, %0|%0, %2}
+   %{nf%} and{}\t{%2, %0|%0, %2}
+   %{nf%} and{}\t{%2, %0|%0, %2}
+   %{nf%} and{}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} and{}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} and{}\t{%2, %1, %0|%0, %1, %2}
+   #"
+  [(set (attr "isa")
+   (cond [(eq_attr "alternative" "3,4,5")
+(const_string "apx_ndd")
+  (eq_attr "alternative" "6")
+(if_then_else (eq_attr "mode" "SI")
+  (const_string "avx512bw")
+  (const_string "avx512f"))
+ ]
+ (const_string "*")))
+   (set_attr "type" "alu,alu,alu,alu,alu,alu,msklog")
+   (set_attr "type" "alu")
+   (set_attr "mode" "")])
+
 (define_insn "*and_1"
   [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,r,r,r,Ya,?k")
(and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" 
"%0,0,rm,rjM,r,qm,k") @@ -11923,6 +11989,37 @@
(const_string "*")))
(set_attr "mode" ",SI,")])
 
+;; NF for and,or,xor
+
+(define_insn "*qi_1_nf"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,r,r,?k")
+   (any_logic:QI (match_operand:QI 1 "nonimmediate_operand" 
"%0,0,0,rm,r,k")
+  (match_operand:QI 2 "general_operand" "qn,m,rn,rn,m,k")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (, QImode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} {b}\t{%2, %0|%0, %2}
+   %{nf%} {b}\t{%2, %0|%0, %2}
+   %{nf%} {l}\t{%k2, %k0|%k0, %k2}
+   %{nf%} {b}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} {b}\t{%2, %1, %0|%0, %1, %2}
+   #"
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd,avx512f")
+   (set_attr "type" "alu,alu,alu,alu,alu,msklog")
+   (set (attr "mode")
+   (cond [(eq_attr "alternative" "2")
+(const_string "SI")
+   (and (eq_attr "alternative" "5")
+(match_test 

[PATCH 1/8] [APX NF]: Support APX NF add

2024-05-15 Thread Kong, Lingling
From: Hongyu Wang 

The APX NF (no flags) feature suppresses the update of status flags for 
arithmetic operations.

For NF add, it is not clear whether NF add can be faster than lea. If so, the 
pattern needs to be adjusted to prefer LEA generation.

gcc/ChangeLog:

* config/i386/i386-opts.h (enum apx_features): Add nf
enumeration.
* config/i386/i386.h (TARGET_APX_NF): New.
* config/i386/i386.md (*add_1_nf): New define_insn.
* config/i386/i386.opt: Add apx_nf enumeration.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd.c: Fixed test.
* gcc.target/i386/apx-nf.c: New test.

Co-authored-by: Lingling Kong 

Bootstrapped and regtested on x86_64-linux-gnu. SPEC 2017 also runs 
normally on the Intel Software Development Emulator.
Ok for trunk?

---
 gcc/config/i386/i386-opts.h |  3 +-
 gcc/config/i386/i386.h  |  1 +
 gcc/config/i386/i386.md | 42 +
 gcc/config/i386/i386.opt|  3 ++
 gcc/testsuite/gcc.target/i386/apx-ndd.c |  2 +-
 gcc/testsuite/gcc.target/i386/apx-nf.c  |  6 
 6 files changed, 55 insertions(+), 2 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/apx-nf.c

diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h index 
ef2825803b3..60176ce609f 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -140,7 +140,8 @@ enum apx_features {
   apx_push2pop2 = 1 << 1,
   apx_ndd = 1 << 2,
   apx_ppx = 1 << 3,
-  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx,
+  apx_nf = 1<< 4,
+  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx | apx_nf,
 };
 
 #endif
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 
529edff93a4..f20ae4726da 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -55,6 +55,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see  #define TARGET_APX_PUSH2POP2 (ix86_apx_features & apx_push2pop2)  
#define TARGET_APX_NDD (ix86_apx_features & apx_ndd)  #define TARGET_APX_PPX 
(ix86_apx_features & apx_ppx)
+#define TARGET_APX_NF (ix86_apx_features & apx_nf)
 
 #include "config/vxworks-dummy.h"
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
764bfe20ff2..4a9e35c4990 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6233,6 +6233,48 @@
 }
 })
 

+;; NF instructions.
+
+(define_insn "*add_1_nf"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" "=rm,rje,r,r,r,r,r,r")
+   (plus:SWI
+ (match_operand:SWI 1 "nonimmediate_operand" "%0,0,0,r,r,rje,jM,r")
+ (match_operand:SWI 2 "x86_64_general_operand" 
+"r,e,BM,0,le,r,e,BM")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (PLUS, mode, operands,
+   TARGET_APX_NDD)"
+{
+  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
+  if (which_alternative == 3)
+  std::swap (operands[1], operands[2]);
+
+  if (operands[2] == const1_rtx)
+return use_ndd
+ ? "%{nf%} inc{}\t{%1, %0|%0, %1}"
+ : "%{nf%} inc{}\t{%0|%0}";
+
+  if (operands[2] == constm1_rtx)
+return use_ndd
+ ? "%{nf%} dec{}\t{%1, %0|%0, %1}"
+ : "%{nf%} dec{}\t{%0|%0}";
+
+  return use_ndd
+? "%{nf%} add{}\t{%2, %1, %0|%0, %1, %2}"
+: "%{nf%} add{}\t{%2, %0|%0, %2}"; }
+  [(set_attr "isa" "*,*,*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd")
+   (set (attr "type")
+ (cond [(eq_attr "alternative" "4")
+  (const_string "lea")
+  ]
+  (const_string "alu")))
+   (set (attr "length_immediate")
+  (if_then_else
+   (and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+   (const_string "1")
+   (const_string "*")))
+   (set_attr "mode" "")])
+
 ;; Load effective address instructions
 
 (define_insn "*lea"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 
d5f793a9e8b..66021d59d4e 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1356,6 +1356,9 @@ Enum(apx_features) String(ndd) Value(apx_ndd) Set(4)  
EnumValue
 Enum(apx_features) String(ppx) Value(apx_ppx) Set(5)
 
+EnumValue
+Enum(apx_features) String(nf) Value(apx_nf) Set(6)
+
 EnumValue
 Enum(apx_features) String(all) Value(apx_all) Set(1)
 
diff --git a/gcc/testsuite/gcc.target/i386/apx-ndd.c 
b/gcc/testsuite/gcc.target/i386/apx-ndd.c
index 0eb751ad225..0ff4df0780c 100644
--- a/gcc/testsuite/gcc.target/i386/apx-ndd.c
+++ b/gcc/testsuite/gcc.target/i386/apx-ndd.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-mapxf -march=x86-64 -O2" } */
+/* { dg-options "-mapx-features=egpr,push2pop2,ndd,ppx -march=x86-64 
+-O2" } */
 /* { dg-final { scan-assembler-not "movl"} } */
 
 #include 
diff --git a/gcc/testsuite/gcc.target/i386/apx-nf.c 
b/gcc/testsuite/gcc.target/i386/apx-nf.c
new file mode 100644
index 000..3adc7a27902
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-nf.c
@@ -0,0 +1,6 @@
+/* { 

[PATCH] i386: fix ix86_hardreg_mov_ok with lra_in_progress

2024-05-06 Thread Kong, Lingling
Hi,
Originally eliminate_regs_in_insn will transform 
(parallel [
  (set (reg:QI 130)
(plus:QI (subreg:QI (reg:DI 19 frame) 0)
  (const_int 96)))
  (clobber (reg:CC 17 flag))]) {*addqi_1} 
to 
(set (reg:QI 130) 
  (subreg:QI (reg:DI 19 frame) 0)) {*movqi_internal}
when verify_changes.

But with No Flags add, it transforms
(set (reg:QI 5 di)
  (plus:QI (subreg:QI (reg:DI 19 frame) 0)
   (const_int 96))) {*addqi_1_nf}
to
(set (reg:QI 5 di)
 (subreg:QI (reg:DI 19 frame) 0)) {*addqi_1_nf}.
There are no extra clobbers at the end, and its destination register is a hard register, 
so ix86_hardreg_mov_ok returns false. As a result, the insn fails to be updated, which 
causes an ICE when it is transformed to movqi_internal.

However, it is actually safe for ix86_hardreg_mov_ok to return true while lra_in_progress is set.

SPEC 2017 was also tested, and performance was not affected.
Bootstrapped and regtested on x86_64-pc-linux-gnu. OK for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_hardreg_mov_ok): Relax
hard reg mov restriction when lra in progress.
---
 gcc/config/i386/i386.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 
4d6b2b98761..ca4348a18bf 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20357,7 +20357,8 @@ ix86_hardreg_mov_ok (rtx dst, rtx src)
   ? standard_sse_constant_p (src, GET_MODE (dst))
   : x86_64_immediate_operand (src, GET_MODE (dst)))
   && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))
-  && !reload_completed)
+  && !reload_completed
+  && !lra_in_progress)
 return false;
   return true;
 }
--
2.31.1



[PATCH] x86: Fix cmov cost model issue [PR109549]

2024-05-05 Thread Kong, Lingling
Hi,
(if_then_else:SI (eq (reg:CCZ 17 flags)
(const_int 0 [0]))
(reg/v:SI 101 [ e ])
(reg:SI 102))
The cost of this rtx is 8; the cost of
(eq (reg:CCZ 17 flags) (const_int 0 [0])) is 4, but that is just the comparison operator 
of the cmov, so there is no need to compute its cost.

Bootstrapped and regtested on x86_64-pc-linux-gnu.
OK for trunk?

gcc/ChangeLog:

PR target/109549
* config/i386/i386.cc (ix86_rtx_costs): The XEXP (x, 0) of a cmov
is a comparison operator; do not compute its cost.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cmov6.c: Fixed.
---
 gcc/config/i386/i386.cc   | 2 +-
 gcc/testsuite/gcc.target/i386/cmov6.c | 5 +
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 
4d6b2b98761..59b4ce3bfbf 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22237,7 +22237,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
{
  /* cmov.  */
  *total = COSTS_N_INSNS (1);
- if (!REG_P (XEXP (x, 0)))
+ if (!COMPARISON_P (XEXP (x, 0)) && !REG_P (XEXP (x, 0)))
*total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
  if (!REG_P (XEXP (x, 1)))
*total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); diff --git 
a/gcc/testsuite/gcc.target/i386/cmov6.c b/gcc/testsuite/gcc.target/i386/cmov6.c
index 5111c8a9099..535326e4c2a 100644
--- a/gcc/testsuite/gcc.target/i386/cmov6.c
+++ b/gcc/testsuite/gcc.target/i386/cmov6.c
@@ -1,9 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -march=k8" } */
-/* if-converting this sequence would require two cmov
-   instructions and seems to always cost more independent
-   of the TUNE_ONE_IF_CONV setting.  */
-/* { dg-final { scan-assembler-not "cmov\[^6\]" } } */
+/* { dg-final { scan-assembler "cmov\[^6\]" } } */
 
 /* Verify that blocks are converted to conditional moves.  */  extern int bar 
(int, int);
--
2.31.1



RE: [PATCH] i386: Prefer remote atomic insn for atomic_fetch{add, and, or, xor}

2022-11-07 Thread Kong, Lingling via Gcc-patches
> On Sun, Nov 6, 2022 at 2:00 PM Kong, Lingling via Gcc-patches  patc...@gcc.gnu.org> wrote:
> >
> > Hi
> >
> > The patch is to add flag -mprefer-remote-atomic to control whether to
> generate raoint insn for atomic operations.
> > Ok for trunk?
> 
> Please note TARGET_AVOID_MFENCE tuning flag, introduced a while ago due to
> the fact that several targets perform LOCK OR faster than MFENCE.
> 
> It was determined that MFENCE/SFENCE/LFENCE are much more complex
> instructions compared to LOCK OR, since they have to handle cases that C
> memory model never describes (some MMIO, or such). Considering that
> ordinary LOCKed operations adequately cover C memory model, and are
> probably faster than new instructions that have to cover all special cases, I
> wonder if there is really benefit to emit these insns instead of existing 
> LOCKed
> operations. These should IMO be used only via relevant builtins.
> 
> Uros.
> 

OK, I will revert this patch on trunk. 
I will wait until optimization results from actual hardware are available, and 
then consider pushing the optimization patch.

> >
> > BRs,
> > Lingling
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386.opt:Add -mprefer-remote-atomic.
> > * config/i386/sync.md (atomic_):
> > New define_expand.
> > (atomic_add): Rename to below one.
> > (atomic_add_1): To this.
> > (atomic_): Ditto.
> > (atomic__1): Ditto.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/raoint-atomic-fetch.c: New test.
> > ---
> >  gcc/config/i386/i386.opt  |  4 +++
> >  gcc/config/i386/sync.md   | 29 ---
> >  .../gcc.target/i386/raoint-atomic-fetch.c | 29 +++
> >  3 files changed, 58 insertions(+), 4 deletions(-)  create mode 100644
> > gcc/testsuite/gcc.target/i386/raoint-atomic-fetch.c
> >
> > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index
> > 415c52e1bb4..abb1e5ecbdc 100644
> > --- a/gcc/config/i386/i386.opt
> > +++ b/gcc/config/i386/i386.opt
> > @@ -1246,3 +1246,7 @@ Support PREFETCHI built-in functions and code
> generation.
> >  mraoint
> >  Target Mask(ISA2_RAOINT) Var(ix86_isa_flags2) Save  Support RAOINT built-in
> functions and code generation.
> > +
> > +mprefer-remote-atomic
> > +Target Var(flag_prefer_remote_atomic) Init(0) Prefer use remote
> > +atomic insn for atomic operations.
> > diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md index
> > e6543a5efb0..08e944fc9b7 100644
> > --- a/gcc/config/i386/sync.md
> > +++ b/gcc/config/i386/sync.md
> > @@ -37,7 +37,7 @@
> >UNSPECV_CMPXCHG
> >UNSPECV_XCHG
> >UNSPECV_LOCK
> > -
> > +
> >;; For CMPccXADD support
> >UNSPECV_CMPCCXADD
> >
> > @@ -791,7 +791,28 @@
> >  (define_code_iterator any_plus_logic [and ior xor plus])
> > (define_code_attr plus_logic [(and "and") (ior "or") (xor "xor") (plus
> > "add")])
> >
> > -(define_insn "rao_a"
> > +(define_expand "atomic_"
> > +  [(match_operand:SWI 0 "memory_operand")
> > +   (any_plus_logic:SWI (match_dup 0)
> > +  (match_operand:SWI 1 "nonmemory_operand"))
> > +   (match_operand:SI 2 "const_int_operand")]
> > +  ""
> > +{
> > +  if (flag_prefer_remote_atomic
> > +  && TARGET_RAOINT && operands[2] == const0_rtx
> > +  && (mode == SImode || mode == DImode))
> > +  {
> > +if (CONST_INT_P (operands[1]))
> > +  operands[1] = force_reg (mode, operands[1]);
> > +emit_insn (maybe_gen_rao_a (, mode, operands[0],
> > +operands[1]));
> > +  }
> > +  else
> > +emit_insn (gen_atomic__1 (operands[0], operands[1],
> > +   operands[2]));
> > +  DONE;
> > +})
> > +
> > +(define_insn "@rao_a"
> >[(set (match_operand:SWI48 0 "memory_operand" "+m")
> > (unspec_volatile:SWI48
> >   [(any_plus_logic:SWI48 (match_dup 0) @@ -801,7 +822,7 @@
> >"TARGET_RAOINT"
> >"a\t{%1, %0|%0, %1}")
> >
> > -(define_insn "atomic_add"
> > +(define_insn "atomic_add_1"
> >[(set (match_operand:SWI 0 "memory_operand" "+m")
> > (unspec_volatile:SWI
> >   [(plu

[PATCH] [committed] i386: Fix typo in sse-22.c pragma

2022-11-07 Thread Kong, Lingling via Gcc-patches


gcc/testsuite/ChangeLog:

* gcc.target/i386/sse-22.c: Fix typo in pragma GCC target.

Pushing as obvious.

Thanks,
Lingling
---
 gcc/testsuite/gcc.target/i386/sse-22.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c 
b/gcc/testsuite/gcc.target/i386/sse-22.c
index f5808e4513b..f600bb544b2 100644
--- a/gcc/testsuite/gcc.target/i386/sse-22.c
+++ b/gcc/testsuite/gcc.target/i386/sse-22.c
@@ -103,7 +103,7 @@
 
 
 #ifndef DIFFERENT_PRAGMAS
-#pragma GCC target 
("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert,amx-fp16.raoint")
+#pragma GCC target 
("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vbmi2,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16,avx512vp2intersect,serialize,tsxldtrk,amx-tile,amx-int8,amx-bf16,kl,widekl,avxvnni,avx512fp16,avxifma,avxvnniint8,avxneconvert,amx-fp16,raoint")
 #endif
 
 /* Following intrinsics require immediate arguments.  They
-- 
2.27.0



[PATCH] i386: Prefer remote atomic insn for atomic_fetch{add, and, or, xor}

2022-11-06 Thread Kong, Lingling via Gcc-patches
Hi

This patch adds the -mprefer-remote-atomic flag, which controls whether
RAO-INT instructions are generated for atomic operations.
OK for trunk?

BRs,
Lingling

gcc/ChangeLog:

* config/i386/i386.opt: Add -mprefer-remote-atomic.
* config/i386/sync.md (atomic_<plus_logic><mode>):
New define_expand.
(atomic_add<mode>): Rename to below one.
(atomic_add<mode>_1): To this.
(atomic_<logic><mode>): Ditto.
(atomic_<logic><mode>_1): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/raoint-atomic-fetch.c: New test.
---
 gcc/config/i386/i386.opt  |  4 +++
 gcc/config/i386/sync.md   | 29 ---
 .../gcc.target/i386/raoint-atomic-fetch.c | 29 +++
 3 files changed, 58 insertions(+), 4 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/raoint-atomic-fetch.c

diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 
415c52e1bb4..abb1e5ecbdc 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1246,3 +1246,7 @@ Support PREFETCHI built-in functions and code generation.
 mraoint
 Target Mask(ISA2_RAOINT) Var(ix86_isa_flags2) Save  Support RAOINT built-in 
functions and code generation.
+
+mprefer-remote-atomic
+Target Var(flag_prefer_remote_atomic) Init(0) Prefer use remote atomic 
+insn for atomic operations.
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md index 
e6543a5efb0..08e944fc9b7 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -37,7 +37,7 @@
   UNSPECV_CMPXCHG
   UNSPECV_XCHG
   UNSPECV_LOCK
- 
+
   ;; For CMPccXADD support
   UNSPECV_CMPCCXADD
 
@@ -791,7 +791,28 @@
 (define_code_iterator any_plus_logic [and ior xor plus])  (define_code_attr 
plus_logic [(and "and") (ior "or") (xor "xor") (plus "add")])
 
-(define_insn "rao_a"
+(define_expand "atomic_"
+  [(match_operand:SWI 0 "memory_operand")
+   (any_plus_logic:SWI (match_dup 0)
+  (match_operand:SWI 1 "nonmemory_operand"))
+   (match_operand:SI 2 "const_int_operand")]
+  ""
+{
+  if (flag_prefer_remote_atomic
+  && TARGET_RAOINT && operands[2] == const0_rtx
+  && (mode == SImode || mode == DImode))
+  {
+if (CONST_INT_P (operands[1]))
+  operands[1] = force_reg (mode, operands[1]);
+emit_insn (maybe_gen_rao_a (, mode, operands[0], 
+operands[1]));
+  }
+  else
+emit_insn (gen_atomic__1 (operands[0], operands[1],
+   operands[2]));
+  DONE;
+})
+
+(define_insn "@rao_a"
   [(set (match_operand:SWI48 0 "memory_operand" "+m")
(unspec_volatile:SWI48
  [(any_plus_logic:SWI48 (match_dup 0) @@ -801,7 +822,7 @@
   "TARGET_RAOINT"
   "a\t{%1, %0|%0, %1}")
 
-(define_insn "atomic_add"
+(define_insn "atomic_add_1"
   [(set (match_operand:SWI 0 "memory_operand" "+m")
(unspec_volatile:SWI
  [(plus:SWI (match_dup 0)
@@ -855,7 +876,7 @@
   return "lock{%;} %K2sub{}\t{%1, %0|%0, %1}";
 })
 
-(define_insn "atomic_"
+(define_insn "atomic__1"
   [(set (match_operand:SWI 0 "memory_operand" "+m")
(unspec_volatile:SWI
  [(any_logic:SWI (match_dup 0)
diff --git a/gcc/testsuite/gcc.target/i386/raoint-atomic-fetch.c 
b/gcc/testsuite/gcc.target/i386/raoint-atomic-fetch.c
new file mode 100644
index 000..ac4099d888e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/raoint-atomic-fetch.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-mraoint -O2 -mprefer-remote-atomic" } */
+/* { dg-final { scan-assembler-times "aadd" 2 { target {! ia32 } } } } 
+*/
+/* { dg-final { scan-assembler-times "aand" 2 { target {! ia32 } } } } 
+*/
+/* { dg-final { scan-assembler-times "aor" 2 { target {! ia32 } } } } 
+*/
+/* { dg-final { scan-assembler-times "axor" 2 { target {! ia32 } } } } 
+*/
+/* { dg-final { scan-assembler-times "aadd" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "aand" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "aor" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "axor" 1 { target ia32 } } } */ 
+volatile int x; volatile long long y; int *a; long long *b;
+
+void extern
+rao_int_test (void)
+{
+  __atomic_add_fetch (a, x, __ATOMIC_RELAXED);
+  __atomic_and_fetch (a, x, __ATOMIC_RELAXED);
+  __atomic_or_fetch (a, x, __ATOMIC_RELAXED);
+  __atomic_xor_fetch (a, x, __ATOMIC_RELAXED); #ifdef __x86_64__
+  __atomic_add_fetch (b, y, __ATOMIC_RELAXED);
+  __atomic_and_fetch (b, y, __ATOMIC_RELAXED);
+  __atomic_or_fetch (b, y, __ATOMIC_RELAXED);
+  __atomic_xor_fetch (b, y, __ATOMIC_RELAXED); #endif }
--
2.27.0



[PATCH] Support Intel RAO-INT

2022-11-06 Thread Kong, Lingling via Gcc-patches
Hi,
This patch adds support for Intel RAO-INT.

The information is based on the newly released
Intel Architecture Instruction Set Extensions and Future Features document.

The document is available at:
https://www.intel.com/content/www/us/en/develop/download/intel-architecture-instruction-set-extensions-programming-reference.html.

OK for trunk?

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features):
Detect raoint.
* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_RAOINT_SET,
OPTION_MASK_ISA2_RAOINT_UNSET): New.
(ix86_handle_option): Handle -mraoint.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_RAOINT.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
raoint.
* config.gcc: Add raointintrin.h
* config/i386/cpuid.h (bit_RAOINT): New.
* config/i386/i386-builtin.def (BDESC): Add new builtins.
* config/i386/i386-c.cc (ix86_target_macros_internal): Define
__RAOINT__.
* config/i386/i386-isa.def (RAOINT): Add DEF_PTA(RAOINT).
* config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p):
Add -mraoint.
* config/i386/sync.md (rao_a): New define insn.
* config/i386/i386.opt: Add option -mraoint.
* config/i386/x86gprintrin.h: Include raointintrin.h.
* doc/extend.texi: Document raoint.
* doc/invoke.texi: Document -mraoint.
* doc/sourcebuild.texi: Document target raoint.
* config/i386/raointintrin.h: New file.

gcc/testsuite/ChangeLog:

* g++.dg/other/i386-2.C: Add -mraoint.
* g++.dg/other/i386-3.C: Ditto.
* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/sse-12.c: Add -mraoint.
* gcc.target/i386/sse-13.c: Ditto.
* gcc.target/i386/sse-14.c: Ditto.
* gcc.target/i386/sse-22.c: Add raoint target.
* gcc.target/i386/sse-23.c: Ditto.
* lib/target-supports.exp: Add check_effective_target_raoint.
* gcc.target/i386/rao-helper.h: New test.
* gcc.target/i386/raoint-1.c: Ditto.
* gcc.target/i386/raoint-aadd-2.c: Ditto.
* gcc.target/i386/raoint-aand-2.c: Ditto.
* gcc.target/i386/raoint-aor-2.c: Ditto.
* gcc.target/i386/raoint-axor-2.c: Ditto.
* gcc.target/i386/x86gprintrin-1.c: Ditto.
* gcc.target/i386/x86gprintrin-2.c: Ditto.
* gcc.target/i386/x86gprintrin-3.c: Ditto.
* gcc.target/i386/x86gprintrin-4.c: Ditto.
* gcc.target/i386/x86gprintrin-5.c: Ditto.
---
 gcc/common/config/i386/cpuinfo.h  |   2 +
 gcc/common/config/i386/i386-common.cc |  15 +++
 gcc/common/config/i386/i386-cpuinfo.h |   1 +
 gcc/common/config/i386/i386-isas.h|   1 +
 gcc/config.gcc|   3 +-
 gcc/config/i386/cpuid.h   |   1 +
 gcc/config/i386/i386-builtin.def  |  10 ++
 gcc/config/i386/i386-c.cc |   2 +
 gcc/config/i386/i386-isa.def  |   1 +
 gcc/config/i386/i386-options.cc   |   4 +-
 gcc/config/i386/i386.opt  |   4 +
 gcc/config/i386/raointintrin.h| 101 ++
 gcc/config/i386/sync.md   |  16 +++
 gcc/config/i386/x86gprintrin.h|   2 +
 gcc/doc/extend.texi   |   5 +
 gcc/doc/invoke.texi   |  11 +-
 gcc/doc/sourcebuild.texi  |   3 +
 gcc/testsuite/g++.dg/other/i386-2.C   |   2 +-
 gcc/testsuite/g++.dg/other/i386-3.C   |   2 +-
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |   2 +
 gcc/testsuite/gcc.target/i386/rao-helper.h|  79 ++
 gcc/testsuite/gcc.target/i386/raoint-1.c  |  31 ++
 gcc/testsuite/gcc.target/i386/raoint-aadd-2.c |  24 +  
gcc/testsuite/gcc.target/i386/raoint-aand-2.c |  25 +  
gcc/testsuite/gcc.target/i386/raoint-aor-2.c  |  25 +  
gcc/testsuite/gcc.target/i386/raoint-axor-2.c |  25 +
 gcc/testsuite/gcc.target/i386/sse-12.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-13.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-14.c|   2 +-
 gcc/testsuite/gcc.target/i386/sse-22.c|   4 +-
 gcc/testsuite/gcc.target/i386/sse-23.c|   2 +-
 .../gcc.target/i386/x86gprintrin-1.c  |   2 +-
 .../gcc.target/i386/x86gprintrin-2.c  |   2 +-
 .../gcc.target/i386/x86gprintrin-3.c  |   2 +-
 .../gcc.target/i386/x86gprintrin-4.c  |   4 +-
 .../gcc.target/i386/x86gprintrin-5.c  |   4 +-
 gcc/testsuite/lib/target-supports.exp |  11 ++
 37 files changed, 413 insertions(+), 21 deletions(-)  create mode 100644 
gcc/config/i386/raointintrin.h  create mode 100644 
gcc/testsuite/gcc.target/i386/rao-helper.h
 create mode 100644 gcc/testsuite/gcc.target/i386/raoint-1.c
 create mode 100644 

RE: [wwwdocs] [GCC13] Mention Intel __bf16 support in AVX512BF16 intrinsics.

2022-11-03 Thread Kong, Lingling via Gcc-patches
> > > diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html
> > > index 7c6bfa6e..cd0282f1 100644
> > > --- a/htdocs/gcc-13/changes.html
> > > +++ b/htdocs/gcc-13/changes.html
> > > @@ -230,6 +230,8 @@ a work-in-progress.
> > >For both C and C++ the __bf16 type is supported on
> > >x86 systems with SSE2 and above enabled.
> > >
> > > +  Use __bf16 type for AVX512BF16 intrinsics.
> > Could you add more explanations. Like originally it's ..., now it's
> > ..., and what's the difference when users compile the same source
> > code(which contains
> > avx512bf16 intrinsics) with gcc12(and before) and GCC13.
> > > +  
> > >  
> > >
> > >  
> > > --
> > > 2.18.2
> > >
> Yes,  changed it. Thanks a lot!
> 
> Subject: [PATCH] Mention Intel __bf16 support in AVX512BF16 intrinsics.
> 
> ---
>  htdocs/gcc-13/changes.html | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html index
> 7c6bfa6e..a35f4fab 100644
> --- a/htdocs/gcc-13/changes.html
> +++ b/htdocs/gcc-13/changes.html
> @@ -230,6 +230,12 @@ a work-in-progress.
>For both C and C++ the __bf16 type is supported on
>x86 systems with SSE2 and above enabled.
>
> +  Use __bf16 type for AVX512BF16 intrinsics.
> + Previously we use  short to represent bf16. Now we introduced
> __bf16 to x86 psABI.
> +  So we switch intrinsics in AVX512BF16 to the new type __bf16.
> +  When users compile the same source code contains AVX512BF16
> + intrinsics with
> +  GCC13 need to support SSE2, which is different to GCC12 (and before).
> +  
>  
> 
>  
> --
> 2.18.2
> 
> BRs,
> Lingling

Sorry, I have modified it again. The new patch is below.

htdocs/gcc-13/changes.html | 5 +
 1 file changed, 5 insertions(+)

diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html index 
7c6bfa6e..7a5d2ab6 100644
--- a/htdocs/gcc-13/changes.html
+++ b/htdocs/gcc-13/changes.html
@@ -230,6 +230,11 @@ a work-in-progress.
   For both C and C++ the __bf16 type is supported on
   x86 systems with SSE2 and above enabled.
   
+  Use real __bf16 type for AVX512BF16 intrinsics. 
+ Previously  we use __bfloat16 which is typedef of short. Now we 
+ introduced real  __bf16 type to x86 psABI. Users need to 
+ adjust their  AVX512BF16-related source code when upgrading GCC12 to GCC13.
+  
 
 
 
--
2.18.2

BRs,
Lingling


RE: [wwwdocs] [GCC13] Mention Intel __bf16 support in AVX512BF16 intrinsics.

2022-11-01 Thread Kong, Lingling via Gcc-patches
> > diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html
> > index 7c6bfa6e..cd0282f1 100644
> > --- a/htdocs/gcc-13/changes.html
> > +++ b/htdocs/gcc-13/changes.html
> > @@ -230,6 +230,8 @@ a work-in-progress.
> >For both C and C++ the __bf16 type is supported on
> >x86 systems with SSE2 and above enabled.
> >
> > +  Use __bf16 type for AVX512BF16 intrinsics.
> Could you add more explanations. Like originally it's ..., now it's ..., and 
> what's
> the difference when users compile the same source code(which contains
> avx512bf16 intrinsics) with gcc12(and before) and GCC13.
> > +  
> >  
> >
> >  
> > --
> > 2.18.2
> >
Yes, I have changed it. Thanks a lot!

Subject: [PATCH] Mention Intel __bf16 support in AVX512BF16 intrinsics.

---
 htdocs/gcc-13/changes.html | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html
index 7c6bfa6e..a35f4fab 100644
--- a/htdocs/gcc-13/changes.html
+++ b/htdocs/gcc-13/changes.html
@@ -230,6 +230,12 @@ a work-in-progress.
   For both C and C++ the __bf16 type is supported on
   x86 systems with SSE2 and above enabled.
   
+  Use __bf16 type for AVX512BF16 intrinsics. Previously we use
+  short to represent bf16. Now we introduced __bf16 to x86 psABI.
+  So we switch intrinsics in AVX512BF16 to the new type __bf16.
+  When users compile the same source code contains AVX512BF16 intrinsics with
+  GCC13 need to support SSE2, which is different to GCC12 (and before).
+  
 

 
--
2.18.2

BRs,
Lingling


[wwwdocs] [GCC13] Mention Intel __bf16 support in AVX512BF16 intrinsics.

2022-10-31 Thread Kong, Lingling via Gcc-patches
Hi

This patch updates the GCC 13 release notes (changes.html) to mention that the
AVX512BF16 intrinsics now use the __bf16 type.
OK for master?

Thanks,
Lingling

---
 htdocs/gcc-13/changes.html | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html index 
7c6bfa6e..cd0282f1 100644
--- a/htdocs/gcc-13/changes.html
+++ b/htdocs/gcc-13/changes.html
@@ -230,6 +230,8 @@ a work-in-progress.
   For both C and C++ the __bf16 type is supported on
   x86 systems with SSE2 and above enabled.
   
+  Use __bf16 type for AVX512BF16 intrinsics.
+  
 
 
 
--
2.18.2



RE: [PATCH 4/6] Support Intel AVX-NE-CONVERT

2022-10-28 Thread Kong, Lingling via Gcc-patches
Hi,

Because we have switched the avx512bf16 intrinsics to the new __bf16 type, we
can now use m128/256bh for the vector bf16 types instead of m128/256bf16.
The builtins for avx512bf16/avxneconvert have also been unified.

Thanks,
Lingling

> -Original Message-
> From: Hongtao Liu 
> Sent: Tuesday, October 25, 2022 1:23 PM
> To: Kong, Lingling 
> Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org; Jiang,
> Haochen 
> Subject: Re: [PATCH 4/6] Support Intel AVX-NE-CONVERT
> 
> On Mon, Oct 24, 2022 at 2:20 PM Kong, Lingling 
> wrote:
> >
> > > From: Gcc-patches
> > > 
> > > On Behalf Of Hongtao Liu via Gcc-patches
> > > Sent: Monday, October 17, 2022 1:47 PM
> > > To: Jiang, Haochen 
> > > Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> > > Subject: Re: [PATCH 4/6] Support Intel AVX-NE-CONVERT
> > >
> > > On Fri, Oct 14, 2022 at 3:58 PM Haochen Jiang via Gcc-patches
> > >  wrote:
> > > >
> > > > From: Kong Lingling 
> > > > +(define_insn "vbcstne2ps_"
> > > > +  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
> > > > +(vec_duplicate:VF1_128_256
> > > > + (unspec:SF
> > > > +  [(match_operand:HI 1 "memory_operand" "m")]
> > > > +  VBCSTNE)))]
> > > > +  "TARGET_AVXNECONVERT"
> > > > +  "vbcstne2ps\t{%1, %0|%0, %1}"
> > > > +  [(set_attr "prefix" "vex")
> > > > +  (set_attr "mode" "")])
> > > Since jakub has support bf16 software emulation, can we rewrite it
> > > with general rtl ir without unspec?
> > > Like (float_extend:SF (match_operand:BF "memory_operand" "m")
> > > > +
> > > > +(define_int_iterator VCVTNEBF16
> > > > +  [UNSPEC_VCVTNEEBF16SF
> > > > +   UNSPEC_VCVTNEOBF16SF])
> > > > +
> > > > +(define_int_attr vcvtnebf16type
> > > > +  [(UNSPEC_VCVTNEEBF16SF "ebf16")
> > > > +   (UNSPEC_VCVTNEOBF16SF "obf16")]) (define_insn
> > > > +"vcvtne2ps_"
> > > > +  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
> > > > +(unspec:VF1_128_256
> > > > +  [(match_operand: 1 "memory_operand" "m")]
> > > > + VCVTNEBF16))]
> > > > +  "TARGET_AVXNECONVERT"
> > > > +  "vcvtne2ps\t{%1, %0|%0, %1}"
> > > > +  [(set_attr "prefix" "vex")
> > > > +   (set_attr "mode" "")])
> > > Similar for this one and all those patterns below.
> >
> > That's great! Thanks for the review!
> > Now rewrite it without unspec and use float_extend for new define_insn.
> Ok.
> >
> > Thanks
> > Lingling
> >
> >
> 
> 
> --
> BR,
> Hongtao


0001-Support-Intel-AVX-NE-CONVERT.patch
Description: 0001-Support-Intel-AVX-NE-CONVERT.patch


[PATCH] i386: using __bf16 for AVX512BF16 intrinsics

2022-10-28 Thread Kong, Lingling via Gcc-patches
Hi,

Previously we used unsigned short to represent bf16. That was not a good
representation, but at the time the front end did not support a bf16 type.
Now that __bf16 has been introduced to the x86 psABI, we can switch the
intrinsics to the new type.

Ok for trunk ?

Thanks,
Lingling

gcc/ChangeLog:

* config/i386/avx512bf16intrin.h (__attribute__): Change short to bf16.
(_mm_cvtsbh_ss): Ditto.
(_mm512_cvtne2ps_pbh): Ditto.
(_mm512_mask_cvtne2ps_pbh): Ditto.
(_mm512_maskz_cvtne2ps_pbh): Ditto.
* config/i386/avx512bf16vlintrin.h (__attribute__): Ditto.
(_mm256_cvtne2ps_pbh): Ditto.
(_mm256_mask_cvtne2ps_pbh): Ditto.
(_mm256_maskz_cvtne2ps_pbh): Ditto.
(_mm_cvtne2ps_pbh): Ditto.
(_mm_mask_cvtne2ps_pbh): Ditto.
(_mm_maskz_cvtne2ps_pbh): Ditto.
(_mm_cvtness_sbh): Ditto.
* config/i386/i386-builtin-types.def (V8BF): Add new
DEF_VECTOR_TYPE for BFmode.
(V16BF): Ditto.
(V32BF): Ditto.
* config/i386/i386-builtin.def (BDESC): Fixed builtins.
* config/i386/i386-expand.cc (ix86_expand_args_builtin): Changed
avx512bf16 ix86_builtin_func_type included HI to BF.
* config/i386/immintrin.h: Add SSE2 depend for avx512bf16.
* config/i386/sse.md (TARGET_AVX512VL): Changed HI vector to BF
vector.
(avx512f_cvtneps2bf16_v4sf): New define_expand.
(*avx512f_cvtneps2bf16_v4sf): New define_insn.
(avx512f_cvtneps2bf16_v4sf_maskz): Ditto.
(avx512f_cvtneps2bf16_v4sf_mask): Ditto.
(avx512f_cvtneps2bf16_v4sf_mask_1): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Add fpmath option.
* gcc.target/i386/avx512bf16-vdpbf16ps-2.c: Fixed
scan-assembler.
* gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Add x/y suffix
for vcvtneps2bf16.
* gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c: Ditto.
---
 gcc/config/i386/avx512bf16intrin.h|  12 +--
 gcc/config/i386/avx512bf16vlintrin.h  |  29 ++---
 gcc/config/i386/i386-builtin-types.def|  51 -
 gcc/config/i386/i386-builtin.def  |  54 +-
 gcc/config/i386/i386-expand.cc|  48 -
 gcc/config/i386/immintrin.h   |   2 +
 gcc/config/i386/sse.md| 101 ++
 .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c  |   2 +-
 .../gcc.target/i386/avx512bf16-vdpbf16ps-2.c  |   2 +-
 .../i386/avx512bf16vl-cvtness2sbh-1.c |   2 +-
 .../i386/avx512bf16vl-vcvtneps2bf16-1.c   |  12 +--
 11 files changed, 189 insertions(+), 126 deletions(-)

diff --git a/gcc/config/i386/avx512bf16intrin.h 
b/gcc/config/i386/avx512bf16intrin.h
index b6e9ddad157..ea1d0125b3f 100644
--- a/gcc/config/i386/avx512bf16intrin.h
+++ b/gcc/config/i386/avx512bf16intrin.h
@@ -35,16 +35,16 @@
 #endif /* __AVX512BF16__ */
 
 /* Internal data types for implementing the intrinsics.  */
-typedef short __v32bh __attribute__ ((__vector_size__ (64)));
+typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
 
 /* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components.  */
-typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
 
 /* Convert One BF16 Data to One Single Float Data.  */
 extern __inline float
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsbh_ss (__bfloat16 __A)
+_mm_cvtsbh_ss (__bf16 __A)
 {
   union{ float a; unsigned int b;} __tmp;
   __tmp.b = ((unsigned int)(__A)) << 16;
@@ -57,21 +57,21 @@ extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
 }
 
 extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
 }
 
 extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
 }
 
 /* vcvtneps2bf16 */
diff --git a/gcc/config/i386/avx512bf16vlintrin.h 
b/gcc/config/i386/avx512bf16vlintrin.h
index 969335ff358..56c28f14cf6 100644
--- a/gcc/config/i386/avx512bf16vlintrin.h
+++ b/gcc/config/i386/avx512bf16vlintrin.h
@@ -35,57 +35,58 @@
 #endif /* __AVX512BF16__ */
 
 /* Internal data types for 

RE: [PATCH 4/6] Support Intel AVX-NE-CONVERT

2022-10-24 Thread Kong, Lingling via Gcc-patches
> From: Gcc-patches 
> On Behalf Of Hongtao Liu via Gcc-patches
> Sent: Monday, October 17, 2022 1:47 PM
> To: Jiang, Haochen 
> Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH 4/6] Support Intel AVX-NE-CONVERT
>
> On Fri, Oct 14, 2022 at 3:58 PM Haochen Jiang via Gcc-patches
>  wrote:
> >
> > From: Kong Lingling 
> > +(define_insn "vbcstne2ps_"
> > +  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
> > +(vec_duplicate:VF1_128_256
> > + (unspec:SF
> > +  [(match_operand:HI 1 "memory_operand" "m")]
> > +  VBCSTNE)))]
> > +  "TARGET_AVXNECONVERT"
> > +  "vbcstne2ps\t{%1, %0|%0, %1}"
> > +  [(set_attr "prefix" "vex")
> > +  (set_attr "mode" "")])
> Since jakub has support bf16 software emulation, can we rewrite it
> with general rtl ir without unspec?
> Like (float_extend:SF (match_operand:BF "memory_operand" "m")
> > +
> > +(define_int_iterator VCVTNEBF16
> > +  [UNSPEC_VCVTNEEBF16SF
> > +   UNSPEC_VCVTNEOBF16SF])
> > +
> > +(define_int_attr vcvtnebf16type
> > +  [(UNSPEC_VCVTNEEBF16SF "ebf16")
> > +   (UNSPEC_VCVTNEOBF16SF "obf16")])
> > +(define_insn "vcvtne2ps_"
> > +  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
> > +(unspec:VF1_128_256
> > +  [(match_operand: 1 "memory_operand" "m")]
> > + VCVTNEBF16))]
> > +  "TARGET_AVXNECONVERT"
> > +  "vcvtne2ps\t{%1, %0|%0, %1}"
> > +  [(set_attr "prefix" "vex")
> > +   (set_attr "mode" "")])
> Similar for this one and all those patterns below.

That's great! Thanks for the review!
I have now rewritten it without unspec, using float_extend for the new define_insn.

Thanks
Lingling




0001-Support-Intel-AVX-NE-CONVERT.patch
Description: 0001-Support-Intel-AVX-NE-CONVERT.patch


RE: [PATCH] Enhance final_value_replacement_loop to handle bitop with an invariant induction.[PR105735]

2022-09-20 Thread Kong, Lingling via Gcc-patches
Thanks a lot, pushed to trunk.

> Hi Richard,
> 
> Thanks again for your reviewing.
> 
> > Yes, use else if for the bitwise induction.  Can you also make the new
> > case conditional on 'def'
> > (the compute_overall_effect_of_inner_loop) being chrec_dont_know?  If
> > that call produced something useful it will not be of either of the two 
> > special
> forms.
> > Thus like
> >
> >   if (def != chrec_dont_know)
> > /* Already OK.  */
> > ;
> >  else if ((bitinv_def = ...)
> > ..
> >  else if (tree_fits_uhwi_p (niter)
> >  ... bitwise induction case...)
> > ...
> >
> Yes, I fixed it in new patch. Thanks.
> Ok for master ?
> 
> Thanks,
> Lingling
> 
> > -Original Message-
> > From: Richard Biener 
> > Sent: Wednesday, September 14, 2022 4:16 PM
> > To: Kong, Lingling 
> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao 
> > Subject: Re: [PATCH] Enhance final_value_replacement_loop to handle
> > bitop with an invariant induction.[PR105735]
> >
> > On Tue, Sep 13, 2022 at 9:54 AM Kong, Lingling
> > 
> > wrote:
> > >
> > > Hi Richard,
> > >
> > > Thanks you so much for reviewing this patch.  I really appreciate
> > > it. For these
> > review comments, I have made some changes.
> > >
> > > > That's a single-stmt match, you shouldn't use match.pd matching for 
> > > > this.
> > > > Instead just do
> > > >
> > > >   if (is_gimple_assign (stmt)
> > > >   && ((code = gimple_assign_rhs_code (stmt)), true)
> > > >   && (code == BIT_AND_EXPR || code == BIT_IOR_EXPR || code ==
> > > > BIT_XOR_EXPR))
> > >
> > > Yes, I fixed it and dropped modification for match.pd.
> > >
> > > > and pick gimple_assign_rhs{1,2} (stmt) as the operands.  The :c in
> > > > bit_op:c is redundant btw. - while the name suggests "with
> > > > invariant" you don't actually check for that.  But again, given
> > > > canonicalization rules the invariant will be rhs2 so above add
> > > >
> > > > && TREE_CODE (gimple_assign_rhs2 (stmt)) == INTEGER_CST
> > >
> > > For " with invariant", this needed op1 is invariant, and I used
> > `expr_invariant_in_loop_p (loop, match_op[0])` for check.
> > > And op2 just be PHI is ok. If op2 is INTEGER_CST, existing gcc can
> > > be directly
> > optimized and do not need modification.
> > >
> > > > you probably need dg-require-effective-target longlong, but is it
> > > > necessary to use long long for the testcases in the first place?
> > > > The IV seems to be unused, if it should match the variables bit
> > > > size use sizeof
> > > > (type) * 8
> > >
> > > Yes, It is not necessary to use long long for the testcases. I
> > > changed type to
> > unsigned int.
> > >
> > > > > +  inv = PHI_ARG_DEF_FROM_EDGE (header_phi, loop_preheader_edge
> > > > > + (loop));  return fold_build2 (code1, type, inv, match_op[0]);
> > > > > + }
> > > >
> > > > The } goes to the next line.
> > >
> > > Sorry, It might be something wrong with my use of gcc send-email format.
> > >
> > > > > +  tree bitinv_def;
> > > > > +  if ((bitinv_def
> > > >
> > > > please use else if here
> > >
> > > Sorry, If use the else if here, there is no corresponding above if.
> > > I'm not sure if
> > you mean change bitwise induction expression if to else if.
> >
> > Yes, use else if for the bitwise induction.  Can you also make the new
> > case conditional on 'def'
> > (the compute_overall_effect_of_inner_loop) being chrec_dont_know?  If
> > that call produced something useful it will not be of either of the two 
> > special
> forms.
> > Thus like
> >
> >   if (def != chrec_dont_know)
> > /* Already OK.  */
> > ;
> >  else if ((bitinv_def = ...)
> > ..
> >  else if (tree_fits_uhwi_p (niter)
> >  ... bitwise induction case...)
> > ...
> >
> > ?
> >
> > Otherwise looks OK now.
> >
> > Thanks,
> > Richard.
> >
> > > Do you agree with these changes?  Thanks again for taking a look.
> > >
> > > Thanks,
> > > Lingling
> > >
> > > > -Original Mess

RE: [PATCH] i386: Fixed vec_init_dup_v16bf [PR106887]

2022-09-16 Thread Kong, Lingling via Gcc-patches
Hi,
 
> >   machine_mode hvmode = (mode == V16HImode ? V8HImode
> >  : mode == V16HFmode ? V8HFmode
> > +: mode == V16BFmode ? V8BFmode
> Can it be written as switch case?
Sure, I fixed it in the new patch. Thanks again for taking a look.
OK for master?

Thanks,
Lingling

> -Original Message-
> From: Hongtao Liu 
> Sent: Thursday, September 15, 2022 11:46 AM
> To: Kong, Lingling 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao 
> Subject: Re: [PATCH] i386: Fixed vec_init_dup_v16bf [PR106887]
> 
> On Thu, Sep 15, 2022 at 11:36 AM Kong, Lingling via Gcc-patches  patc...@gcc.gnu.org> wrote:
> >
> > Hi
> >
> > The patch is to fix vec_init_dup_v16bf, add correct handle for v16bf mode in
> ix86_expand_vector_init_duplicate.
> > Add testcase with sse2 without avx2.
> >
> > OK for master?
> >
> > gcc/ChangeLog:
> >
> > PR target/106887
> > * config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
> > Fixed V16BF mode case.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/106887
> > * gcc.target/i386/vect-bfloat16-2c.c: New test.
> > ---
> >  gcc/config/i386/i386-expand.cc|  1 +
> >  .../gcc.target/i386/vect-bfloat16-2c.c| 76 +++
> >  2 files changed, 77 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc
> > b/gcc/config/i386/i386-expand.cc index d7b49c99dc8..9451c561489 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -15111,6 +15111,7 @@ ix86_expand_vector_init_duplicate (bool
> mmx_ok, machine_mode mode,
> > {
> >   machine_mode hvmode = (mode == V16HImode ? V8HImode
> >  : mode == V16HFmode ? V8HFmode
> > +: mode == V16BFmode ? V8BFmode
> Can it be written as switch case?
> >  : V16QImode);
> >   rtx x = gen_reg_rtx (hvmode);
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
> > b/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
> > new file mode 100644
> > index 000..bead94e46a1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
> > @@ -0,0 +1,76 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mf16c -msse2 -mno-avx2 -O2" } */
> > +
> > +typedef __bf16 v8bf __attribute__ ((__vector_size__ (16))); typedef
> > +__bf16 v16bf __attribute__ ((__vector_size__ (32)));
> > +
> > +#define VEC_EXTRACT(V,S,IDX)   \
> > +  S\
> > +  __attribute__((noipa))   \
> > +  vec_extract_##V##_##IDX (V v)\
> > +  {\
> > +return v[IDX]; \
> > +  }
> > +
> > +#define VEC_SET(V,S,IDX)   \
> > +  V\
> > +  __attribute__((noipa))   \
> > +  vec_set_##V##_##IDX (V v, S s)   \
> > +  {\
> > +v[IDX] = s;\
> > +return v;  \
> > +  }
> > +
> > +v8bf
> > +vec_init_v8bf (__bf16 a1, __bf16 a2, __bf16 a3, __bf16 a4,
> > +  __bf16 a5,  __bf16 a6, __bf16 a7, __bf16 a8) {
> > +return __extension__ (v8bf) {a1, a2, a3, a4, a5, a6, a7, a8}; }
> > +
> > +v16bf
> > +vec_init_v16bf (__bf16 a1, __bf16 a2, __bf16 a3, __bf16 a4,
> > +  __bf16 a5,  __bf16 a6, __bf16 a7, __bf16 a8,
> > +  __bf16 a9,  __bf16 a10, __bf16 a11, __bf16 a12,
> > +  __bf16 a13,  __bf16 a14, __bf16 a15, __bf16 a16) {
> > +return __extension__ (v16bf) {a1, a2, a3, a4, a5, a6, a7, a8,
> > + a9, a10, a11, a12, a13, a14, a15,
> > +a16}; }
> > +
> > +v8bf
> > +vec_init_dup_v8bf (__bf16 a1)
> > +{
> > +return __extension__ (v8bf) {a1, a1, a1, a1, a1, a1, a1, a1}; }
> > +
> > +v16bf
> > +vec_init_dup_v16bf (__bf16 a1)
> > +{
> > +return __extension__ (v16bf) {a1, a1, a1, a1, a1, a1, a1, a1,
> > + a1, a1, a1, a1, a1, a1, a1, a1}; }
> > +
> > +/* { dg-final { scan-assembler-times "vpunpcklwd"

RE: [PATCH] Enhance final_value_replacement_loop to handle bitop with an invariant induction.[PR105735]

2022-09-15 Thread Kong, Lingling via Gcc-patches
Hi Richard,

Thanks again for your review.

> Yes, use else if for the bitwise induction.  Can you also make the new case
> conditional on 'def'
> (the compute_overall_effect_of_inner_loop) being chrec_dont_know?  If that
> call produced something useful it will not be of either of the two special 
> forms.
> Thus like
> 
>   if (def != chrec_dont_know)
> /* Already OK.  */
> ;
>  else if ((bitinv_def = ...)
> ..
>  else if (tree_fits_uhwi_p (niter)
>  ... bitwise induction case...)
> ...
>
Yes, I fixed it in the new patch. Thanks.
OK for master?

Thanks,
Lingling

> -Original Message-
> From: Richard Biener 
> Sent: Wednesday, September 14, 2022 4:16 PM
> To: Kong, Lingling 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao 
> Subject: Re: [PATCH] Enhance final_value_replacement_loop to handle bitop
> with an invariant induction.[PR105735]
> 
> On Tue, Sep 13, 2022 at 9:54 AM Kong, Lingling 
> wrote:
> >
> > Hi Richard,
> >
> > Thanks you so much for reviewing this patch.  I really appreciate it. For 
> > these
> review comments, I have made some changes.
> >
> > > That's a single-stmt match, you shouldn't use match.pd matching for this.
> > > Instead just do
> > >
> > >   if (is_gimple_assign (stmt)
> > >   && ((code = gimple_assign_rhs_code (stmt)), true)
> > >   && (code == BIT_AND_EXPR || code == BIT_IOR_EXPR || code ==
> > > BIT_XOR_EXPR))
> >
> > Yes, I fixed it and dropped modification for match.pd.
> >
> > > and pick gimple_assign_rhs{1,2} (stmt) as the operands.  The :c in
> > > bit_op:c is redundant btw. - while the name suggests "with
> > > invariant" you don't actually check for that.  But again, given
> > > canonicalization rules the invariant will be rhs2 so above add
> > >
> > > && TREE_CODE (gimple_assign_rhs2 (stmt)) == INTEGER_CST
> >
> > For " with invariant", this needed op1 is invariant, and I used
> `expr_invariant_in_loop_p (loop, match_op[0])` for check.
> > And op2 just be PHI is ok. If op2 is INTEGER_CST, existing gcc can be 
> > directly
> optimized and do not need modification.
> >
> > > you probably need dg-require-effective-target longlong, but is it
> > > necessary to use long long for the testcases in the first place?
> > > The IV seems to be unused, if it should match the variables bit size
> > > use sizeof
> > > (type) * 8
> >
> > Yes, It is not necessary to use long long for the testcases. I changed type 
> > to
> unsigned int.
> >
> > > > +  inv = PHI_ARG_DEF_FROM_EDGE (header_phi, loop_preheader_edge
> > > > + (loop));  return fold_build2 (code1, type, inv, match_op[0]); }
> > >
> > > The } goes to the next line.
> >
> > Sorry, It might be something wrong with my use of gcc send-email format.
> >
> > > > +  tree bitinv_def;
> > > > +  if ((bitinv_def
> > >
> > > please use else if here
> >
> > Sorry, If use the else if here, there is no corresponding above if. I'm not 
> > sure if
> you mean change bitwise induction expression if to else if.
> 
> Yes, use else if for the bitwise induction.  Can you also make the new case
> conditional on 'def'
> (the compute_overall_effect_of_inner_loop) being chrec_dont_know?  If that
> call produced something useful it will not be of either of the two special 
> forms.
> Thus like
> 
>   if (def != chrec_dont_know)
> /* Already OK.  */
> ;
>  else if ((bitinv_def = ...)
> ..
>  else if (tree_fits_uhwi_p (niter)
>      ... bitwise induction case...)
> ...
> 
> ?
> 
> Otherwise looks OK now.
> 
> Thanks,
> Richard.
> 
> > Do you agree with these changes?  Thanks again for taking a look.
> >
> > Thanks,
> > Lingling
> >
> > > -Original Message-
> > > From: Richard Biener 
> > > Sent: Tuesday, August 23, 2022 3:27 PM
> > > To: Kong, Lingling 
> > > Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> > > Subject: Re: [PATCH] Enhance final_value_replacement_loop to handle
> > > bitop with an invariant induction.[PR105735]
> > >
> > > On Thu, Aug 18, 2022 at 8:48 AM Kong, Lingling via Gcc-patches  > > patc...@gcc.gnu.org> wrote:
> > > >
> > > > Hi,
> > > >
> > > > This patch is for pr105735/pr101991. It will enable below optimization:
> > > > {
> > > > -  long unsigned int bit;
&

[PATCH] i386: Fixed vec_init_dup_v16bf [PR106887]

2022-09-14 Thread Kong, Lingling via Gcc-patches
Hi

This patch fixes vec_init_dup_v16bf by adding correct handling for V16BF mode in
ix86_expand_vector_init_duplicate.
It also adds a testcase that enables SSE2 but not AVX2.

OK for master? 

gcc/ChangeLog:

PR target/106887
* config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
Fixed V16BF mode case.

gcc/testsuite/ChangeLog:

PR target/106887
* gcc.target/i386/vect-bfloat16-2c.c: New test.
---
 gcc/config/i386/i386-expand.cc|  1 +
 .../gcc.target/i386/vect-bfloat16-2c.c| 76 +++
 2 files changed, 77 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc 
index d7b49c99dc8..9451c561489 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -15111,6 +15111,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
{
  machine_mode hvmode = (mode == V16HImode ? V8HImode
 : mode == V16HFmode ? V8HFmode
+: mode == V16BFmode ? V8BFmode
 : V16QImode);
  rtx x = gen_reg_rtx (hvmode);
 
diff --git a/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c 
b/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
new file mode 100644
index 000..bead94e46a1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
@@ -0,0 +1,76 @@
+/* { dg-do compile } */
+/* { dg-options "-mf16c -msse2 -mno-avx2 -O2" } */
+
+typedef __bf16 v8bf __attribute__ ((__vector_size__ (16))); typedef 
+__bf16 v16bf __attribute__ ((__vector_size__ (32)));
+
+#define VEC_EXTRACT(V,S,IDX)   \
+  S\
+  __attribute__((noipa))   \
+  vec_extract_##V##_##IDX (V v)\
+  {\
+return v[IDX]; \
+  }
+
+#define VEC_SET(V,S,IDX)   \
+  V\
+  __attribute__((noipa))   \
+  vec_set_##V##_##IDX (V v, S s)   \
+  {\
+v[IDX] = s;\
+return v;  \
+  }
+
+v8bf
+vec_init_v8bf (__bf16 a1, __bf16 a2, __bf16 a3, __bf16 a4,
+  __bf16 a5,  __bf16 a6, __bf16 a7, __bf16 a8) {
+return __extension__ (v8bf) {a1, a2, a3, a4, a5, a6, a7, a8}; }
+
+v16bf
+vec_init_v16bf (__bf16 a1, __bf16 a2, __bf16 a3, __bf16 a4,
+  __bf16 a5,  __bf16 a6, __bf16 a7, __bf16 a8,
+  __bf16 a9,  __bf16 a10, __bf16 a11, __bf16 a12,
+  __bf16 a13,  __bf16 a14, __bf16 a15, __bf16 a16) {
+return __extension__ (v16bf) {a1, a2, a3, a4, a5, a6, a7, a8,
+ a9, a10, a11, a12, a13, a14, a15, a16}; }
+
+v8bf
+vec_init_dup_v8bf (__bf16 a1)
+{
+return __extension__ (v8bf) {a1, a1, a1, a1, a1, a1, a1, a1}; }
+
+v16bf
+vec_init_dup_v16bf (__bf16 a1)
+{
+return __extension__ (v16bf) {a1, a1, a1, a1, a1, a1, a1, a1,
+ a1, a1, a1, a1, a1, a1, a1, a1};
+}
+
+/* { dg-final { scan-assembler-times "vpunpcklwd" 12 } } */
+/* { dg-final { scan-assembler-times "vpunpckldq" 6 } } */
+/* { dg-final { scan-assembler-times "vpunpcklqdq" 3 } } */
+
+VEC_EXTRACT (v8bf, __bf16, 0);
+VEC_EXTRACT (v8bf, __bf16, 4);
+VEC_EXTRACT (v16bf, __bf16, 0);
+VEC_EXTRACT (v16bf, __bf16, 3);
+VEC_EXTRACT (v16bf, __bf16, 8);
+VEC_EXTRACT (v16bf, __bf16, 15);
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$8" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$6" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$14" 1 } } */
+/* { dg-final { scan-assembler-times "vextract" 4 } } */
+
+VEC_SET (v8bf, __bf16, 4);
+VEC_SET (v16bf, __bf16, 3);
+VEC_SET (v16bf, __bf16, 8);
+VEC_SET (v16bf, __bf16, 15);
+/* { dg-final { scan-assembler-times "vpblendw" 3 { target { ! ia32 } } 
+} } */
+
+/* { dg-final { scan-assembler-times "vpinsrw" 30 { target ia32 } } } 
+*/
+
--
2.18.2



RE: [PATCH] Enhance final_value_replacement_loop to handle bitop with an invariant induction.[PR105735]

2022-09-13 Thread Kong, Lingling via Gcc-patches
Hi Richard,

Thank you so much for reviewing this patch.  I really appreciate it. I have made
some changes in response to these review comments.

> That's a single-stmt match, you shouldn't use match.pd matching for this.
> Instead just do
> 
>   if (is_gimple_assign (stmt)
>   && ((code = gimple_assign_rhs_code (stmt)), true)
>   && (code == BIT_AND_EXPR || code == BIT_IOR_EXPR || code ==
> BIT_XOR_EXPR))

Yes, I fixed it and dropped modification for match.pd.

> and pick gimple_assign_rhs{1,2} (stmt) as the operands.  The :c in bit_op:c is
> redundant btw. - while the name suggests "with invariant" you don't actually
> check for that.  But again, given canonicalization rules the invariant will 
> be rhs2
> so above add
> 
> && TREE_CODE (gimple_assign_rhs2 (stmt)) == INTEGER_CST

Regarding "with invariant": op1 needs to be loop-invariant, and I check that with
`expr_invariant_in_loop_p (loop, match_op[0])`.
Op2 only needs to be a PHI. If op2 is an INTEGER_CST, existing GCC can already
optimize it directly and needs no modification.

> you probably need dg-require-effective-target longlong, but is it necessary to
> use long long for the testcases in the first place?
> The IV seems to be unused, if it should match the variables bit size use 
> sizeof
> (type) * 8

Yes, it is not necessary to use long long for the testcases. I changed the type to
unsigned int.

> > +  inv = PHI_ARG_DEF_FROM_EDGE (header_phi, loop_preheader_edge
> > + (loop));  return fold_build2 (code1, type, inv, match_op[0]); }
> 
> The } goes to the next line.

Sorry, there might be something wrong with how I used the gcc send-email format.

> > +  tree bitinv_def;
> > +  if ((bitinv_def
> 
> please use else if here

Sorry, if I use else if here, there is no corresponding if above it. I'm not
sure whether you mean changing the bitwise induction expression's if to else if.

Do you agree with these changes?  Thanks again for taking a look.

Thanks,
Lingling

> -Original Message-
> From: Richard Biener 
> Sent: Tuesday, August 23, 2022 3:27 PM
> To: Kong, Lingling 
> Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] Enhance final_value_replacement_loop to handle bitop
> with an invariant induction.[PR105735]
> 
> On Thu, Aug 18, 2022 at 8:48 AM Kong, Lingling via Gcc-patches  patc...@gcc.gnu.org> wrote:
> >
> > Hi,
> >
> > This patch is for pr105735/pr101991. It will enable below optimization:
> > {
> > -  long unsigned int bit;
> > -
> > -   [local count: 32534376]:
> > -
> > -   [local count: 1041207449]:
> > -  # tmp_10 = PHI 
> > -  # bit_12 = PHI 
> > -  tmp_7 = bit2_6(D) & tmp_10;
> > -  bit_8 = bit_12 + 1;
> > -  if (bit_8 != 32)
> > -goto ; [96.97%]
> > -  else
> > -goto ; [3.03%]
> > -
> > -   [local count: 1009658865]:
> > -  goto ; [100.00%]
> > -
> > -   [local count: 32534376]:
> > -  # tmp_11 = PHI 
> > -  return tmp_11;
> > +  tmp_11 = tmp_4(D) & bit2_6(D);
> > +  return tmp_11;
> >
> > }
> >
> > Ok for master ?
> >
> > gcc/ChangeLog:
> >
> > PR middle-end/105735
> > * match.pd (bitop_with_inv_p): New match.
> > * tree-scalar-evolution.cc (gimple_bitop_with_inv_p): Declare.
> > (analyze_and_compute_bitop_with_inv_effect): New function.
> > (final_value_replacement_loop): Enhanced to handle bitop
> > with inv induction.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr105735-1.c: New test.
> > * gcc.target/i386/pr105735-2.c: New test.
> > ---
> >  gcc/match.pd   |  4 +
> >  gcc/testsuite/gcc.target/i386/pr105735-1.c | 88 ++
> gcc/testsuite/gcc.target/i386/pr105735-2.c | 28 +++
> >  gcc/tree-scalar-evolution.cc   | 59 +++
> >  4 files changed, 179 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-2.c
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd index
> > 562138a8034..cfe593ebb02 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -8050,6 +8050,10 @@ and,
> >   (bit_not
> >(nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2))
> > @3
> >
> > +(for bit_op (bit_and bit_ior bit_xor)  (match (bitop_with_inv_p @0
> > +@1)
> > +  (bit_op:c @0 @1)))
> > +
> 
> That's a single-stmt match, you shouldn't use match.pd matching for this.
> Instead just do
> 
>   if (is_gimple_a

RE: [PATCH] x86: Handle V8BF in expand_vec_perm_broadcast_1

2022-09-02 Thread Kong, Lingling via Gcc-patches
Hi,

I fixed it in a new patch.  And added BF vector mode in SUBST_V and 
avx512fmaskhalfmode for @vec_interleave_high.
Ok for trunk ?

> > Hi,
> >
> > Handle E_V8BFmode in expand_vec_perm_broadcast_1 and
> ix86_expand_vector_init_duplicate.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > PR target/106742
> > * config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
> > Handle V8BF mode.
> > (expand_vec_perm_broadcast_1): Ditto.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr106742.c: New test.
> > ---
> >  gcc/config/i386/i386-expand.cc   | 17 -
> >  gcc/testsuite/gcc.target/i386/pr106742.c | 10 ++
> >  2 files changed, 22 insertions(+), 5 deletions(-)  create mode 100644
> > gcc/testsuite/gcc.target/i386/pr106742.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc
> > b/gcc/config/i386/i386-expand.cc index 4b216308a18..a08222fe1b6 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -15030,11 +15030,15 @@ ix86_expand_vector_init_duplicate (bool
> mmx_ok, machine_mode mode,
> >   dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
> >   dperm.one_operand_p = true;
> >
> > - if (mode == V8HFmode)
> > + if (mode == V8HFmode || mode == V8BFmode)
> > {
> > - tmp1 = force_reg (HFmode, val);
> > + rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
> > + tmp1 = mode == V8HFmode ? force_reg (HFmode, val)
> > + : force_reg (BFmode, val);
> tmp1 = force_reg (GET_MODE_INNER (mode), val);
> >   tmp2 = gen_reg_rtx (mode);
> > - emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
> > + gen_vec_set_0 = mode == V8HFmode ? gen_vec_setv8hf_0
> > +  : gen_vec_setv8bf_0;
> add @ to vec_set_0 as (define_insn "@vec_set_0" and pass
> mode to vec_set_0 as emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX
> (mode), tmp1));
> > + emit_insn (gen_vec_set_0 (tmp2, CONST0_RTX (mode),
> > + tmp1));
> 
> >   tmp1 = gen_lowpart (mode, tmp2);
> > }
> >   else
> > @@ -21822,17 +21826,20 @@ expand_vec_perm_broadcast_1 (struct
> expand_vec_perm_d *d)
> >return true;
> >
> >  case E_V8HFmode:
> > +case E_V8BFmode:
> >/* This can be implemented via interleave and pshufd.  */
> >if (d->testing_p)
> > return true;
> >
> >if (elt >= nelt2)
> > {
> > - gen = gen_vec_interleave_highv8hf;
> > + gen = vmode == V8HFmode ? gen_vec_interleave_highv8hf
> > + : gen_vec_interleave_highv8bf;
> Similar, add @ to define_insn and pass gen_vec_interleave.
> >   elt -= nelt2;
> > }
> >else
> > -   gen = gen_vec_interleave_lowv8hf;
> > +   gen = vmode == V8HFmode ? gen_vec_interleave_lowv8hf
> > +   : gen_vec_interleave_lowv8bf;
> >nelt2 /= 2;
> >
> >dest = gen_reg_rtx (vmode);
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106742.c
> > b/gcc/testsuite/gcc.target/i386/pr106742.c
> > new file mode 100644
> > index 000..4a53cd49902
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106742.c
> > @@ -0,0 +1,10 @@
> > +/* { dg-do compile } */
> > +/* { dg-options " -msse2 -mno-avx2 -O1" } */ typedef __bf16 v8bf
> > +__attribute__ ((__vector_size__ (16)));
> > +
> > +v8bf
> > +vec_init_dup_v8bf (__bf16 a1)
> > +{
> > +  return __extension__ (v8bf) { a1, a1, a1, a1, a1, a1, a1, a1 }; }
> > +/* { dg-final { scan-assembler-times "punpcklwd" 1} } */
> > --
> > 2.18.2
> >
> 
> 
> --
> BR,
> Hongtao


0001-x86-Handle-V8BF-in-expand_vec_perm_broadcast_1.patch
Description: 0001-x86-Handle-V8BF-in-expand_vec_perm_broadcast_1.patch


RE: [PATCH] middle-end: Add MULT_EXPR recognition for cond scalar reduction

2022-08-31 Thread Kong, Lingling via Gcc-patches
Hi Richard, could you please take a look at the patch?

Ok for master ?

> Hi,
> 
> The conditional mult reduction cannot be recognized with current GCC. The
> following loop cannot be vectorized.
> Now add MULT_EXPR recognition for conditional scalar reduction.
> 
> float summa(int n, float *arg1, float *arg2)
> {
> int i;
> float res1 = 1.0;
> for(i = 0; i < n; i++) {
>   if(arg2[i])
> res1 *= arg1[i];
> }
> return res1;
> }
> 
> gcc/ChangeLog:
> 
>   * tree-if-conv.cc (is_cond_scalar_reduction): Add MULT_EXPR
>   recognition.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.dg/tree-ssa/gen-vect-34.c: New test.
>   * gcc.dg/vect/vect-ifcvt-18.c: New test.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c | 16 +
>  gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c   | 38 +
>  gcc/tree-if-conv.cc |  1 +
>  3 files changed, 55 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
> 
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
> b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
> new file mode 100644
> index 000..8d2d36401fe
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -fdump-tree-vect-details" } */
> +/* { dg-additional-options "-mavx2" { target { x86_64-*-* i?86-*-* } }
> +} */
> +
> +float summul(int n, float *arg1, float *arg2)
> +{
> +int i;
> +float res1 = 1.0;
> +for(i = 0; i < n; i++) {
> +  if(arg2[i])
> +res1 *= arg1[i];
> +}
> +return res1;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {
> +target { ! { avr-*-* pru-*-* } } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
> b/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
> new file mode 100644
> index 000..c1d3c27d819
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
> @@ -0,0 +1,38 @@
> +/* { dg-require-effective-target vect_condition } */
> +/* { dg-require-effective-target vect_float } */
> +/* { dg-additional-options "-Ofast -mavx" { target avx_runtime } } */
> +
> +
> +int A0[4] = {36,39,42,45};
> +int B0[4] = {42,42,0,42};
> +float A1[8] = {36,39,42,45,43,32,21,12}; float B1[8] =
> +{42,42,0,42,42,42,0,42}; double A2[16] =
> +{36,39,42,45,43,32,21,12,23,34,45,56,42,78,89,11};
> +double B2[16] = {42,42,0,42,42,42,42,42,42,42,42,42,0,42,42,42};
> +
> +int main ()
> +{
> +  int i, j;
> +  int res0 = 1;
> +  float res1 = 1.0;
> +  double res2 = 1.0;
> +
> +  for (i = 0; i < 4; i++)
> +if (B0[i])
> +  res0 *= A0[i];
> +
> +  for (i = 0; i < 8; i++)
> +if (B1[i])
> +  res1 *= A1[i];
> +
> +  for (i = 0; i < 16; i++)
> +if (B2[i])
> +  res2 *= A2[i];
> +  /* check results:  */
> +  if (res0 != 63180 || res1 != 1043228160.00
> +  ||res2 != 3296728515318523101184.00)
> +  __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "vectorized 3 loops" "vect" { target
> +i?86-*-* x86_64-*-* } } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc index
> 1c8e1a45234..bac29fb5574 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1739,6 +1739,7 @@ is_cond_scalar_reduction (gimple *phi, gimple
> **reduc, tree arg_0, tree arg_1,
> 
>if (reduction_op != PLUS_EXPR
>&& reduction_op != MINUS_EXPR
> +  && reduction_op != MULT_EXPR
>&& reduction_op != BIT_IOR_EXPR
>&& reduction_op != BIT_XOR_EXPR
>&& reduction_op != BIT_AND_EXPR)
> --
> 2.18.2



[PATCH] x86: Handle V8BF in expand_vec_perm_broadcast_1

2022-08-31 Thread Kong, Lingling via Gcc-patches
Hi,

Handle E_V8BFmode in expand_vec_perm_broadcast_1 and 
ix86_expand_vector_init_duplicate.
Ok for trunk?

gcc/ChangeLog:

PR target/106742
* config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
Handle V8BF mode.
(expand_vec_perm_broadcast_1): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr106742.c: New test.
---
 gcc/config/i386/i386-expand.cc   | 17 -
 gcc/testsuite/gcc.target/i386/pr106742.c | 10 ++
 2 files changed, 22 insertions(+), 5 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/pr106742.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc 
index 4b216308a18..a08222fe1b6 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -15030,11 +15030,15 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
  dperm.one_operand_p = true;
 
- if (mode == V8HFmode)
+ if (mode == V8HFmode || mode == V8BFmode)
{
- tmp1 = force_reg (HFmode, val);
+ rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
+ tmp1 = mode == V8HFmode ? force_reg (HFmode, val)
+ : force_reg (BFmode, val);
  tmp2 = gen_reg_rtx (mode);
- emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
+ gen_vec_set_0 = mode == V8HFmode ? gen_vec_setv8hf_0
+  : gen_vec_setv8bf_0;
+ emit_insn (gen_vec_set_0 (tmp2, CONST0_RTX (mode), tmp1));
  tmp1 = gen_lowpart (mode, tmp2);
}
  else
@@ -21822,17 +21826,20 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d 
*d)
   return true;
 
 case E_V8HFmode:
+case E_V8BFmode:
   /* This can be implemented via interleave and pshufd.  */
   if (d->testing_p)
return true;
 
   if (elt >= nelt2)
{
- gen = gen_vec_interleave_highv8hf;
+ gen = vmode == V8HFmode ? gen_vec_interleave_highv8hf
+ : gen_vec_interleave_highv8bf;
  elt -= nelt2;
}
   else
-   gen = gen_vec_interleave_lowv8hf;
+   gen = vmode == V8HFmode ? gen_vec_interleave_lowv8hf
+   : gen_vec_interleave_lowv8bf;
   nelt2 /= 2;
 
   dest = gen_reg_rtx (vmode);
diff --git a/gcc/testsuite/gcc.target/i386/pr106742.c 
b/gcc/testsuite/gcc.target/i386/pr106742.c
new file mode 100644
index 000..4a53cd49902
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106742.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options " -msse2 -mno-avx2 -O1" } */
+typedef __bf16 v8bf __attribute__ ((__vector_size__ (16)));
+
+v8bf
+vec_init_dup_v8bf (__bf16 a1)
+{
+  return __extension__ (v8bf) { a1, a1, a1, a1, a1, a1, a1, a1 }; }
+/* { dg-final { scan-assembler-times "punpcklwd" 1} } */
--
2.18.2



[PATCH] middle-end: Add MULT_EXPR recognition for cond scalar reduction

2022-08-25 Thread Kong, Lingling via Gcc-patches
Hi,

Current GCC cannot recognize a conditional multiplication reduction, so the
following loop cannot be vectorized.
This patch adds MULT_EXPR recognition for conditional scalar reductions.

float summa(int n, float *arg1, float *arg2)
{  
int i; 
float res1 = 1.0;
for(i = 0; i < n; i++) {
  if(arg2[i]) 
res1 *= arg1[i];
}  
return res1;   
}

gcc/ChangeLog:

* tree-if-conv.cc (is_cond_scalar_reduction): Add MULT_EXPR
recognition.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/gen-vect-34.c: New test.
* gcc.dg/vect/vect-ifcvt-18.c: New test.
---
 gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c | 16 +
 gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c   | 38 +
 gcc/tree-if-conv.cc |  1 +
 3 files changed, 55 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c 
b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
new file mode 100644
index 000..8d2d36401fe
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-vect-details" } */
+/* { dg-additional-options "-mavx2" { target { x86_64-*-* i?86-*-* } } 
+} */
+
+float summul(int n, float *arg1, float *arg2)
+{  
+int i; 
+float res1 = 1.0;
+for(i = 0; i < n; i++) {
+  if(arg2[i]) 
+res1 *= arg1[i];
+}  
+return res1;   
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { 
+target { ! { avr-*-* pru-*-* } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c 
b/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
new file mode 100644
index 000..c1d3c27d819
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
@@ -0,0 +1,38 @@
+/* { dg-require-effective-target vect_condition } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-Ofast -mavx" { target avx_runtime } } */
+
+
+int A0[4] = {36,39,42,45};
+int B0[4] = {42,42,0,42};
+float A1[8] = {36,39,42,45,43,32,21,12}; float B1[8] = 
+{42,42,0,42,42,42,0,42}; double A2[16] = 
+{36,39,42,45,43,32,21,12,23,34,45,56,42,78,89,11};
+double B2[16] = {42,42,0,42,42,42,42,42,42,42,42,42,0,42,42,42};
+
+int main ()
+{
+  int i, j;
+  int res0 = 1;
+  float res1 = 1.0;
+  double res2 = 1.0;
+
+  for (i = 0; i < 4; i++)
+if (B0[i])
+  res0 *= A0[i];
+
+  for (i = 0; i < 8; i++)
+if (B1[i])
+  res1 *= A1[i];
+  
+  for (i = 0; i < 16; i++)
+if (B2[i])
+  res2 *= A2[i];
+  /* check results:  */
+  if (res0 != 63180 || res1 != 1043228160.00
+  ||res2 != 3296728515318523101184.00)
+  __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized 3 loops" "vect" { target 
+i?86-*-* x86_64-*-* } } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc index 
1c8e1a45234..bac29fb5574 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1739,6 +1739,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, 
tree arg_0, tree arg_1,
 
   if (reduction_op != PLUS_EXPR
   && reduction_op != MINUS_EXPR
+  && reduction_op != MULT_EXPR
   && reduction_op != BIT_IOR_EXPR
   && reduction_op != BIT_XOR_EXPR
   && reduction_op != BIT_AND_EXPR)
--
2.18.2



RE: [PATCH] Enhance final_value_replacement_loop to handle bitop with an invariant induction.[PR105735]

2022-08-22 Thread Kong, Lingling via Gcc-patches
Hi Richard, could you please take a look at the patch?
 
> Hi,
> 
> This patch is for pr105735/pr101991. It will enable below optimization:
> {
> -  long unsigned int bit;
> -
> -   [local count: 32534376]:
> -
> -   [local count: 1041207449]:
> -  # tmp_10 = PHI 
> -  # bit_12 = PHI 
> -  tmp_7 = bit2_6(D) & tmp_10;
> -  bit_8 = bit_12 + 1;
> -  if (bit_8 != 32)
> -goto ; [96.97%]
> -  else
> -goto ; [3.03%]
> -
> -   [local count: 1009658865]:
> -  goto ; [100.00%]
> -
> -   [local count: 32534376]:
> -  # tmp_11 = PHI 
> -  return tmp_11;
> +  tmp_11 = tmp_4(D) & bit2_6(D);
> +  return tmp_11;
> 
> }
> 
> Ok for master ?
> 
> gcc/ChangeLog:
> 
>   PR middle-end/105735
>   * match.pd (bitop_with_inv_p): New match.
>   * tree-scalar-evolution.cc (gimple_bitop_with_inv_p): Declare.
>   (analyze_and_compute_bitop_with_inv_effect): New function.
>   (final_value_replacement_loop): Enhanced to handle bitop
>   with inv induction.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/i386/pr105735-1.c: New test.
>   * gcc.target/i386/pr105735-2.c: New test.
> ---
>  gcc/match.pd   |  4 +
>  gcc/testsuite/gcc.target/i386/pr105735-1.c | 88 ++
> gcc/testsuite/gcc.target/i386/pr105735-2.c | 28 +++
>  gcc/tree-scalar-evolution.cc   | 59 +++
>  4 files changed, 179 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-2.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd index 562138a8034..cfe593ebb02
> 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8050,6 +8050,10 @@ and,
>   (bit_not
>(nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3
> 
> +(for bit_op (bit_and bit_ior bit_xor)
> + (match (bitop_with_inv_p @0 @1)
> +  (bit_op:c @0 @1)))
> +
>  /* n - (((n > C1) ? n : C1) & -C2) ->  n & C1 for unsigned case.
> n - (((n > C1) ? n : C1) & -C2) ->  (n <= C1) ? n : (n & C1) for signed 
> case.  */
> (simplify diff --git a/gcc/testsuite/gcc.target/i386/pr105735-1.c
> b/gcc/testsuite/gcc.target/i386/pr105735-1.c
> new file mode 100644
> index 000..8d2123ed351
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105735-1.c
> @@ -0,0 +1,88 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> +/* { dg-final { scan-tree-dump-times {final value replacement} 8 "sccp"
> +} } */
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 0; bit < 64; bit++)
> +tmp &= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo1 (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 63; bit >= 0; bit -=3)
> +tmp &= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo2 (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 0; bit < 64; bit++)
> +tmp |= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo3 (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 63; bit >= 0; bit -=3)
> +tmp |= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo4 (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 0; bit < 64; bit++)
> +tmp ^= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo5 (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 0; bit < 63; bit++)
> +tmp ^= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +f (unsigned long long tmp, long long bit, unsigned long long bit2) {
> +  unsigned long long res = tmp;
> +  for (long long i = 0; i < bit; i++)
> +res &= bit2;
> +  return res;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +f1 (unsigned long long tmp, long long bit, unsigned long long bit2) {
> +  unsigned long long res = tmp;
> +  for (long long i = 0; i < bit; i++)
> +res |= bit2;
> +  return res;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +f2 (unsigned long long tmp, long long bit, unsigned long long bit2) {
> +  unsigned long long res = tmp;
> +  for (long long i = 0; i < bit; i++)
> +res ^= bit2;
> +  return res;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/pr105735-2.c
> b/gcc/testsuite/gcc.target/i386/pr105735-2.c
> new file mode 100644
> index 000..79c1d300b1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105735-2.c
> @@ -0,0 +1,28 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1" } */
> +
> +#include "pr105735-1.c"
> +
> +int main()
> +{
> +  unsigned long long tmp = 0x1101101ULL;
> +  unsigned long long bit2 = 0x11100111ULL;
> +  if (foo (tmp, bit2) != 0x1100101ULL)
> +__builtin_abort ();
> +  if (foo1 (tmp, bit2) != 0x1100101ULL)
> +__builtin_abort ();
> +  if (foo2 (tmp, bit2) != 

[wwwdocs] [GCC13] Mention Intel __bf16 support.

2022-08-18 Thread Kong, Lingling via Gcc-patches
Hi

This patch mentions Intel __bf16 support in the GCC 13 changes page.
Ok for master ?

Thanks,
Lingling

htdocs/gcc-13/changes.html | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html index 
57bd8724..7d98329c 100644
--- a/htdocs/gcc-13/changes.html
+++ b/htdocs/gcc-13/changes.html
@@ -122,7 +122,12 @@ a work-in-progress.
 
 
 
-
+IA-32/x86-64
+
+  For both C and C++ the __bf16 type is supported on
+  x86 systems with SSE2 and above enabled.
+  
+
 
 
 
--
2.18.2



[PATCH] Enhance final_value_replacement_loop to handle bitop with an invariant induction.[PR105735]

2022-08-18 Thread Kong, Lingling via Gcc-patches
Hi,

This patch is for pr105735/pr101991. It will enable below optimization:
{
-  long unsigned int bit;
-
-   [local count: 32534376]:
-
-   [local count: 1041207449]:
-  # tmp_10 = PHI 
-  # bit_12 = PHI 
-  tmp_7 = bit2_6(D) & tmp_10;
-  bit_8 = bit_12 + 1;
-  if (bit_8 != 32)
-goto ; [96.97%]
-  else
-goto ; [3.03%]
-
-   [local count: 1009658865]:
-  goto ; [100.00%]
-
-   [local count: 32534376]:
-  # tmp_11 = PHI 
-  return tmp_11;
+  tmp_11 = tmp_4(D) & bit2_6(D);
+  return tmp_11;

}

Ok for master ?

gcc/ChangeLog:

PR middle-end/105735
* match.pd (bitop_with_inv_p): New match.
* tree-scalar-evolution.cc (gimple_bitop_with_inv_p): Declare.
(analyze_and_compute_bitop_with_inv_effect): New function.
(final_value_replacement_loop): Enhanced to handle bitop
with inv induction.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr105735-1.c: New test.
* gcc.target/i386/pr105735-2.c: New test.
---
 gcc/match.pd   |  4 +
 gcc/testsuite/gcc.target/i386/pr105735-1.c | 88 ++  
gcc/testsuite/gcc.target/i386/pr105735-2.c | 28 +++
 gcc/tree-scalar-evolution.cc   | 59 +++
 4 files changed, 179 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-2.c

diff --git a/gcc/match.pd b/gcc/match.pd index 562138a8034..cfe593ebb02 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8050,6 +8050,10 @@ and,
  (bit_not
   (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3
 
+(for bit_op (bit_and bit_ior bit_xor)
+ (match (bitop_with_inv_p @0 @1)
+  (bit_op:c @0 @1)))
+
 /* n - (((n > C1) ? n : C1) & -C2) ->  n & C1 for unsigned case.
n - (((n > C1) ? n : C1) & -C2) ->  (n <= C1) ? n : (n & C1) for signed 
case.  */  (simplify diff --git a/gcc/testsuite/gcc.target/i386/pr105735-1.c 
b/gcc/testsuite/gcc.target/i386/pr105735-1.c
new file mode 100644
index 000..8d2123ed351
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105735-1.c
@@ -0,0 +1,88 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 8 "sccp" 
+} } */
+
+unsigned long long
+__attribute__((noipa))
+foo (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 0; bit < 64; bit++)
+tmp &= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo1 (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 63; bit >= 0; bit -=3)
+tmp &= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo2 (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 0; bit < 64; bit++)
+tmp |= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo3 (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 63; bit >= 0; bit -=3)
+tmp |= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo4 (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 0; bit < 64; bit++)
+tmp ^= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo5 (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 0; bit < 63; bit++)
+tmp ^= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+f (unsigned long long tmp, long long bit, unsigned long long bit2) {
+  unsigned long long res = tmp;
+  for (long long i = 0; i < bit; i++)
+res &= bit2;
+  return res;
+}
+
+unsigned long long
+__attribute__((noipa))
+f1 (unsigned long long tmp, long long bit, unsigned long long bit2) {
+  unsigned long long res = tmp;
+  for (long long i = 0; i < bit; i++)
+res |= bit2;
+  return res;
+}
+
+unsigned long long
+__attribute__((noipa))
+f2 (unsigned long long tmp, long long bit, unsigned long long bit2) {
+  unsigned long long res = tmp;
+  for (long long i = 0; i < bit; i++)
+res ^= bit2;
+  return res;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr105735-2.c 
b/gcc/testsuite/gcc.target/i386/pr105735-2.c
new file mode 100644
index 000..79c1d300b1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105735-2.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr105735-1.c"
+
+int main()
+{
+  unsigned long long tmp = 0x1101101ULL;
+  unsigned long long bit2 = 0x11100111ULL;
+  if (foo (tmp, bit2) != 0x1100101ULL)
+__builtin_abort ();
+  if (foo1 (tmp, bit2) != 0x1100101ULL)
+__builtin_abort ();
+  if (foo2 (tmp, bit2) != 0x1110ULL)
+__builtin_abort ();
+  if (foo3 (tmp, bit2) != 0x1110ULL)
+__builtin_abort ();
+  if (foo4 (tmp, bit2) != 0x1101101ULL)
+__builtin_abort ();
+  if (foo5 (tmp, bit2) != 0x111010011010ULL)
+__builtin_abort ();
+  if (f (tmp, 64, bit2) != 0x1100101ULL)
+__builtin_abort ();
+  if (f1 (tmp, 64, bit2) != 0x1110ULL)
+

[PATCH] x86: Support vector __bf16 type.

2022-08-16 Thread Kong, Lingling via Gcc-patches
Hi,

This patch adds support for vector init/broadcast/set/extract for the __bf16 type.
The __bf16 type is a storage type.

OK for master?

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Handle vector
BFmode.
(ix86_expand_vector_init_duplicate): Support vector BFmode.
(ix86_expand_vector_init_one_nonzero): Ditto.
(ix86_expand_vector_init_one_var): Ditto.
(ix86_expand_vector_init_concat): Ditto.
(ix86_expand_vector_init_interleave): Ditto.
(ix86_expand_vector_init_general): Ditto.
(ix86_expand_vector_init): Ditto.
(ix86_expand_vector_set_var): Ditto.
(ix86_expand_vector_set): Ditto.
(ix86_expand_vector_extract): Ditto.
* config/i386/i386.cc (classify_argument): Add BF vector modes.
(function_arg_64): Ditto.
(ix86_gimplify_va_arg): Ditto.
(ix86_get_ssemov): Ditto.
* config/i386/i386.h (VALID_AVX256_REG_MODE): Add BF vector modes.
(VALID_AVX512F_REG_MODE): Ditto.
(host_detect_local_cpu): Ditto.
(VALID_SSE2_REG_MODE): Ditto.
* config/i386/i386.md: Add BF vector modes.
(MODE_SIZE): Ditto.
(ssemodesuffix): Add bf suffix for BF vector modes.
(ssevecmode): Ditto.
* config/i386/sse.md (VMOVE): Adjust for BF vector modes.
(VI12HFBF_AVX512VL): Ditto.
(V_256_512): Ditto.
(VF_AVX512HFBF16): Ditto.
(VF_AVX512BWHFBF16): Ditto.
(VIHFBF): Ditto.
(avx512): Ditto.
(VIHFBF_256): Ditto.
(VIHFBF_AVX512BW): Ditto.
(VI2F_256_512): Ditto.
(V8_128): Ditto.
(V16_256): Ditto.
(V32_512): Ditto.
(sseinsnmode): Ditto.
(sseconstm1): Ditto.
(sseintmodesuffix): New mode_attr.
(avx512fmaskmode): Ditto.
(avx512fmaskmodelower): Ditto.
(ssedoublevecmode): Ditto.
(ssehalfvecmode): Ditto.
(ssehalfvecmodelower): Ditto.
(ssescalarmode): Add vector BFmode mapping.
(ssescalarmodelower): Ditto.
(ssexmmmode): Ditto.
(ternlogsuffix): Ditto.
(ssescalarsize): Ditto.
(sseintprefix): Ditto.
(i128): Ditto.
(xtg_mode): Ditto.
(bcstscalarsuff): Ditto.
(_blendm): New define_insn for BFmode.
(_store_mask): Ditto.
(vcond_mask_): Ditto.
(vec_set_0): New define_insn for BF vector set.
(V8BFH_128): New mode_iterator for BFmode.
(avx512fp16_mov): Ditto.
(vec_set): New define_insn for BF vector set.
(@vec_extract_hi_): Ditto.
(@vec_extract_lo_): Ditto.
(vec_set_hi_): Ditto.
(vec_set_lo_): Ditto.
(*vec_extract_0): New define_insn_and_split for BF
vector extract.
(*vec_extract): New define_insn.
(VEC_EXTRACT_MODE): Add BF vector modes.
(PINSR_MODE): Add V8BF.
(sse2p4_1): Ditto.
(pinsr_evex_isa): Ditto.
(_pinsr): Adjust to support
insert for V8BFmode.
(pbroadcast_evex_isa): Add BF vector modes.
(AVX2_VEC_DUP_MODE): Ditto.
(VEC_INIT_MODE): Ditto.
(VEC_INIT_HALF_MODE): Ditto.
(avx2_pbroadcast): Adjust to support BF vector mode
broadcast.
(avx2_pbroadcast_1): Ditto.
(_vec_dup_1): Ditto.
(_vec_dup_gpr):
Ditto.

gcc/testsuite/ChangeLog:

* g++.target/i386/vect-bfloat16-1.C: New test.
* gcc.target/i386/vect-bfloat16-1.c: New test.
* gcc.target/i386/vect-bfloat16-2a.c: New test.
* gcc.target/i386/vect-bfloat16-2b.c: New test.
* gcc.target/i386/vect-bfloat16-typecheck_1.c: New test.
* gcc.target/i386/vect-bfloat16-typecheck_2.c: New test.
---
 gcc/config/i386/i386-expand.cc| 129 +++--
 gcc/config/i386/i386.cc   |  16 +-
 gcc/config/i386/i386.h|  12 +-
 gcc/config/i386/i386.md   |   9 +-
 gcc/config/i386/sse.md| 211 --
 .../g++.target/i386/vect-bfloat16-1.C |  13 +
 .../gcc.target/i386/vect-bfloat16-1.c |  30 ++
 .../gcc.target/i386/vect-bfloat16-2a.c| 121 
 .../gcc.target/i386/vect-bfloat16-2b.c|  22 ++
 .../i386/vect-bfloat16-typecheck_1.c  | 258 ++
 .../i386/vect-bfloat16-typecheck_2.c  | 248 +
 11 files changed, 950 insertions(+), 119 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/vect-bfloat16-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-typecheck_1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-typecheck_2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 

RE: [PATCH] x86: Enable __bf16 type for TARGET_SSE2 and above

2022-08-03 Thread Kong, Lingling via Gcc-patches
Hi,

The old patch had a mistake in `*movbf_internal`; this version disables BFmode constant
double moves in `*movbf_internal`.

Thanks,
Lingling

> -Original Message-
> From: Kong, Lingling 
> Sent: Tuesday, July 26, 2022 9:31 AM
> To: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> Cc: Kong, Lingling 
> Subject: [PATCH] x86: Enable __bf16 type for TARGET_SSE2 and above
> 
> Hi,
> 
> This patch enables the __bf16 scalar type for target SSE2 and above, according to the
> psABI (https://gitlab.com/x86-psABIs/x86-64-ABI/-/merge_requests/35/diffs).
> The __bf16 type is a storage type, as on Arm.
> 
> OK for master?
> 
> gcc/ChangeLog:
> 
>   * config/i386/i386-builtin-types.def (BFLOAT16): New primitive type.
>   * config/i386/i386-builtins.cc : Support __bf16 type for i386 backend.
>   (ix86_register_bf16_builtin_type): New function.
>   (ix86_bf16_type_node): New.
>   (ix86_bf16_ptr_type_node): Ditto.
>   (ix86_init_builtin_types): Add ix86_register_bf16_builtin_type function
> call.
>   * config/i386/i386-modes.def (FLOAT_MODE): Add BFmode.
>   (ADJUST_FLOAT_FORMAT): Ditto.
>   * config/i386/i386.cc (merge_classes): Handle BFmode.
>   (classify_argument): Ditto.
>   (examine_argument): Ditto.
>   (construct_container): Ditto.
>   (function_value_32): Return __bf16 by %xmm0.
>   (function_value_64): Return __bf16 by SSE register.
>   (ix86_print_operand): Handle CONST_DOUBLE BFmode.
>   (ix86_secondary_reload): Require gpr as intermediate register
>   to store __bf16 from sse register when sse4 is not available.
>   (ix86_scalar_mode_supported_p): Enable __bf16 under sse2.
>   (ix86_mangle_type): Add mangling for __bf16 type.
>   (ix86_invalid_conversion): New function for target hook.
>   (ix86_invalid_unary_op): Ditto.
>   (ix86_invalid_binary_op): Ditto.
>   (TARGET_INVALID_CONVERSION): New define for target hook.
>   (TARGET_INVALID_UNARY_OP): Ditto.
>   (TARGET_INVALID_BINARY_OP): Ditto.
>   * config/i386/i386.h (host_detect_local_cpu): Add BFmode.
>   * config/i386/i386.md (*pushhf_rex64): Change for BFmode.
>   (*push_rex64): Ditto.
>   (*pushhf): Ditto.
>   (*push): Ditto.
>   (*movhf_internal): Ditto.
>   (*mov_internal): Ditto.
> 
> gcc/testsuite/ChangeLog:
> 
>   * g++.target/i386/bfloat_cpp_typecheck.C: New test.
>   * gcc.target/i386/bfloat16-1.c: Ditto.
>   * gcc.target/i386/sse2-bfloat16-1.c: Ditto.
>   * gcc.target/i386/sse2-bfloat16-2.c: Ditto.
>   * gcc.target/i386/sse2-bfloat16-scalar-typecheck.c: Ditto.
> ---
>  gcc/config/i386/i386-builtin-types.def|   1 +
>  gcc/config/i386/i386-builtins.cc  |  21 ++
>  gcc/config/i386/i386-modes.def|   2 +
>  gcc/config/i386/i386.cc   |  75 +-
>  gcc/config/i386/i386.h|   4 +-
>  gcc/config/i386/i386.md   |  32 +--
>  .../g++.target/i386/bfloat_cpp_typecheck.C|  10 +
>  gcc/testsuite/gcc.target/i386/bfloat16-1.c|  12 +
>  .../gcc.target/i386/sse2-bfloat16-1.c |   8 +
>  .../gcc.target/i386/sse2-bfloat16-2.c |  17 ++
>  .../i386/sse2-bfloat16-scalar-typecheck.c | 215 ++
>  11 files changed, 375 insertions(+), 22 deletions(-)  create mode 100644
> gcc/testsuite/g++.target/i386/bfloat_cpp_typecheck.C
>  create mode 100644 gcc/testsuite/gcc.target/i386/bfloat16-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-bfloat16-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-bfloat16-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-bfloat16-scalar-
> typecheck.c
> 
> diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-
> builtin-types.def
> index 7a2da1db0b0..63a360b0f8b 100644
> --- a/gcc/config/i386/i386-builtin-types.def
> +++ b/gcc/config/i386/i386-builtin-types.def
> @@ -69,6 +69,7 @@ DEF_PRIMITIVE_TYPE (UINT16,
> short_unsigned_type_node)  DEF_PRIMITIVE_TYPE (INT64,
> long_long_integer_type_node)  DEF_PRIMITIVE_TYPE (UINT64,
> long_long_unsigned_type_node)  DEF_PRIMITIVE_TYPE (FLOAT16,
> ix86_float16_type_node)
> +DEF_PRIMITIVE_TYPE (BFLOAT16, ix86_bf16_type_node)
>  DEF_PRIMITIVE_TYPE (FLOAT, float_type_node)  DEF_PRIMITIVE_TYPE
> (DOUBLE, double_type_node)  DEF_PRIMITIVE_TYPE (FLOAT80,
> float80_type_node) diff --git a/gcc/config/i386/i386-builtins.cc
> b/gcc/config/i386/i386-builtins.cc
> index fe7243c3837..6a04fb57e65 100644
> --- a/gcc/config/i386/i386-builtins.cc
> +++ b/gcc/config/i386/i386-builtins.cc
> @@ -126,6 +126,9 @@ BDESC_VERIFYS (IX86_BUILTIN_MAX,  static GTY(()) tree
> ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]

[PATCH] x86: Enable __bf16 type for TARGET_SSE2 and above

2022-07-25 Thread Kong, Lingling via Gcc-patches
Hi,

This patch enables the __bf16 scalar type for target SSE2 and above, according to the
psABI (https://gitlab.com/x86-psABIs/x86-64-ABI/-/merge_requests/35/diffs).
The __bf16 type is a storage type, as on Arm.

OK for master?

gcc/ChangeLog:

* config/i386/i386-builtin-types.def (BFLOAT16): New primitive type.
* config/i386/i386-builtins.cc : Support __bf16 type for i386 backend.
(ix86_register_bf16_builtin_type): New function.
(ix86_bf16_type_node): New.
(ix86_bf16_ptr_type_node): Ditto.
(ix86_init_builtin_types): Add ix86_register_bf16_builtin_type function 
call.
* config/i386/i386-modes.def (FLOAT_MODE): Add BFmode.
(ADJUST_FLOAT_FORMAT): Ditto.
* config/i386/i386.cc (merge_classes): Handle BFmode.
(classify_argument): Ditto.
(examine_argument): Ditto.
(construct_container): Ditto.
(function_value_32): Return __bf16 by %xmm0.
(function_value_64): Return __bf16 by SSE register.
(ix86_print_operand): Handle CONST_DOUBLE BFmode.
(ix86_secondary_reload): Require gpr as intermediate register
to store __bf16 from sse register when sse4 is not available.
(ix86_scalar_mode_supported_p): Enable __bf16 under sse2.
(ix86_mangle_type): Add mangling for __bf16 type.
(ix86_invalid_conversion): New function for target hook.
(ix86_invalid_unary_op): Ditto.
(ix86_invalid_binary_op): Ditto.
(TARGET_INVALID_CONVERSION): New define for target hook.
(TARGET_INVALID_UNARY_OP): Ditto.
(TARGET_INVALID_BINARY_OP): Ditto.
* config/i386/i386.h (host_detect_local_cpu): Add BFmode.
* config/i386/i386.md (*pushhf_rex64): Change for BFmode.
(*push_rex64): Ditto.
(*pushhf): Ditto.
(*push): Ditto.
(*movhf_internal): Ditto.
(*mov_internal): Ditto.

gcc/testsuite/ChangeLog:

* g++.target/i386/bfloat_cpp_typecheck.C: New test.
* gcc.target/i386/bfloat16-1.c: Ditto.
* gcc.target/i386/sse2-bfloat16-1.c: Ditto.
* gcc.target/i386/sse2-bfloat16-2.c: Ditto.
* gcc.target/i386/sse2-bfloat16-scalar-typecheck.c: Ditto.
---
 gcc/config/i386/i386-builtin-types.def|   1 +
 gcc/config/i386/i386-builtins.cc  |  21 ++
 gcc/config/i386/i386-modes.def|   2 +
 gcc/config/i386/i386.cc   |  75 +-
 gcc/config/i386/i386.h|   4 +-
 gcc/config/i386/i386.md   |  32 +--
 .../g++.target/i386/bfloat_cpp_typecheck.C|  10 +
 gcc/testsuite/gcc.target/i386/bfloat16-1.c|  12 +
 .../gcc.target/i386/sse2-bfloat16-1.c |   8 +
 .../gcc.target/i386/sse2-bfloat16-2.c |  17 ++
 .../i386/sse2-bfloat16-scalar-typecheck.c | 215 ++
 11 files changed, 375 insertions(+), 22 deletions(-)  create mode 100644 
gcc/testsuite/g++.target/i386/bfloat_cpp_typecheck.C
 create mode 100644 gcc/testsuite/gcc.target/i386/bfloat16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-bfloat16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-bfloat16-2.c
 create mode 100644 
gcc/testsuite/gcc.target/i386/sse2-bfloat16-scalar-typecheck.c

diff --git a/gcc/config/i386/i386-builtin-types.def 
b/gcc/config/i386/i386-builtin-types.def
index 7a2da1db0b0..63a360b0f8b 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -69,6 +69,7 @@ DEF_PRIMITIVE_TYPE (UINT16, short_unsigned_type_node)  
DEF_PRIMITIVE_TYPE (INT64, long_long_integer_type_node)  DEF_PRIMITIVE_TYPE 
(UINT64, long_long_unsigned_type_node)  DEF_PRIMITIVE_TYPE (FLOAT16, 
ix86_float16_type_node)
+DEF_PRIMITIVE_TYPE (BFLOAT16, ix86_bf16_type_node)
 DEF_PRIMITIVE_TYPE (FLOAT, float_type_node)  DEF_PRIMITIVE_TYPE (DOUBLE, 
double_type_node)  DEF_PRIMITIVE_TYPE (FLOAT80, float80_type_node) diff --git 
a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
index fe7243c3837..6a04fb57e65 100644
--- a/gcc/config/i386/i386-builtins.cc
+++ b/gcc/config/i386/i386-builtins.cc
@@ -126,6 +126,9 @@ BDESC_VERIFYS (IX86_BUILTIN_MAX,  static GTY(()) tree 
ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
 
 tree ix86_float16_type_node = NULL_TREE;
+tree ix86_bf16_type_node = NULL_TREE;
+tree ix86_bf16_ptr_type_node = NULL_TREE;
+
 /* Retrieve an element from the above table, building some of
the types lazily.  */
 
@@ -1366,6 +1369,22 @@ ix86_register_float16_builtin_type (void)
"_Float16");
 }
 
+static void
+ix86_register_bf16_builtin_type (void)
+{
+  ix86_bf16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (ix86_bf16_type_node) = 16;
+  SET_TYPE_MODE (ix86_bf16_type_node, BFmode);
+  layout_type (ix86_bf16_type_node);
+
+  if (!maybe_get_identifier ("__bf16") && TARGET_SSE2)
+{
+  lang_hooks.types.register_builtin_type (ix86_bf16_type_node,
+   

[PATCH] i386: Fix _mm_[u]comixx_{ss,sd} codegen and add PF result. [PR106113]

2022-07-14 Thread Kong, Lingling via Gcc-patches
Hi,

This patch fixes the _mm_[u]comixx_{ss,sd} codegen and adds the PF result.  The
documented operation of these intrinsics has changed over time.  For example, the
old operation of `_mm_comieq_ss` was `RETURN ( a[31:0] == b[31:0] ) ? 1 : 0`, and the
new operation is `RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] == b[31:0] ) ? 1 : 0`.

OK for master?

gcc/ChangeLog:

PR target/106113
* config/i386/i386-builtin.def (BDESC): Fix [u]comi{ss,sd}
comparison due to intrinsics changed over time.
* config/i386/i386-expand.cc (ix86_ssecom_setcc):
Add unordered check and mode for sse comi codegen.
(ix86_expand_sse_comi): Add unordered check and check a different
CCmode.
(ix86_expand_sse_comi_round): Extract unordered check and mode part
in ix86_ssecom_setcc.

gcc/testsuite/ChangeLog:

PR target/106113
* gcc.target/i386/avx-vcomisd-pr106113-2.c: New test.
* gcc.target/i386/avx-vcomiss-pr106113-2.c: Ditto.
* gcc.target/i386/avx-vucomisd-pr106113-2.c: Ditto.
* gcc.target/i386/avx-vucomiss-pr106113-2.c: Ditto.
* gcc.target/i386/sse-comiss-pr106113-1.c: Ditto.
* gcc.target/i386/sse-comiss-pr106113-2.c: Ditto.
* gcc.target/i386/sse-ucomiss-pr106113-1.c: Ditto.
* gcc.target/i386/sse-ucomiss-pr106113-2.c: Ditto.
* gcc.target/i386/sse2-comisd-pr106113-1.c: Ditto.
* gcc.target/i386/sse2-comisd-pr106113-2.c: Ditto.
* gcc.target/i386/sse2-ucomisd-pr106113-1.c: Ditto.
* gcc.target/i386/sse2-ucomisd-pr106113-2.c: Ditto.
---
 gcc/config/i386/i386-builtin.def  |  32 ++--
 gcc/config/i386/i386-expand.cc| 140 +++---
 .../gcc.target/i386/avx-vcomisd-pr106113-2.c  |   8 +
 .../gcc.target/i386/avx-vcomiss-pr106113-2.c  |   8 +
 .../gcc.target/i386/avx-vucomisd-pr106113-2.c |   8 +
 .../gcc.target/i386/avx-vucomiss-pr106113-2.c |   8 +
 .../gcc.target/i386/sse-comiss-pr106113-1.c   |  19 +++
 .../gcc.target/i386/sse-comiss-pr106113-2.c   |  59 
 .../gcc.target/i386/sse-ucomiss-pr106113-1.c  |  19 +++
 .../gcc.target/i386/sse-ucomiss-pr106113-2.c  |  59 
 .../gcc.target/i386/sse2-comisd-pr106113-1.c  |  19 +++
 .../gcc.target/i386/sse2-comisd-pr106113-2.c  |  59 
 .../gcc.target/i386/sse2-ucomisd-pr106113-1.c |  19 +++
 .../gcc.target/i386/sse2-ucomisd-pr106113-2.c |  59 
 14 files changed, 450 insertions(+), 66 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vcomisd-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vcomiss-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vucomisd-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vucomiss-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-comiss-pr106113-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-comiss-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-ucomiss-pr106113-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-ucomiss-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-comisd-pr106113-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-comisd-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-ucomisd-pr106113-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-ucomisd-pr106113-2.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index fd160935e67..acb7e8ca64b 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -35,30 +35,30 @@
 IX86_BUILTIN__BDESC_##NEXT_KIND##_FIRST - 1.  */
 
 BDESC_FIRST (comi, COMI,
-   OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comieq", 
IX86_BUILTIN_COMIEQSS, UNEQ, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comilt", 
IX86_BUILTIN_COMILTSS, UNLT, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comile", 
IX86_BUILTIN_COMILESS, UNLE, 0)
+   OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comieq", 
IX86_BUILTIN_COMIEQSS, EQ, 0)
+BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comilt", 
IX86_BUILTIN_COMILTSS, LT, 0)
+BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comile", 
IX86_BUILTIN_COMILESS, LE, 0)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comigt", 
IX86_BUILTIN_COMIGTSS, GT, 0)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comige", 
IX86_BUILTIN_COMIGESS, GE, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comineq", 
IX86_BUILTIN_COMINEQSS, LTGT, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", 
IX86_BUILTIN_UCOMIEQSS, UNEQ, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", 
IX86_BUILTIN_UCOMILTSS, UNLT, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", 
IX86_BUILTIN_UCOMILESS, UNLE, 0)
+BDESC (OPTION_MASK_ISA_SSE, 0, 

RE: [PATCH] MAINTAINERS: Add myself for write after approval

2022-06-27 Thread Kong, Lingling via Gcc-patches
Thanks a lot! I fixed it.

ChangeLog:

* MAINTAINERS (Write After Approval): Add myself.
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 54d8ad41a6f..151770f59f4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -478,6 +478,7 @@ Jeff Knaggs 

 Michael Koch   
 Nicolas Koenig 
 Boris Kolpackov

+Lingling Kong  
 Dave Korn  
 Julia Koval
 Matt Kraai 
-- 
2.18.2

> -Original Message-
> From: Hongyu Wang 
> Sent: Monday, June 27, 2022 4:32 PM
> To: Kong, Lingling 
> Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] MAINTAINERS: Add myself for write after approval
> 
> Sorry, should be between
> 
> Boris Kolpackov  Dave Korn
> 
> 
> Hongyu Wang  于2022年6月27日周一 16:29
> 写道:
> >
> > According to the official guide, please sort your last name in
> > alphabetical order, which means you should put your name between
> >
> > Dave Korn  Julia Koval
> > 
> >
> > Kong, Lingling via Gcc-patches  于2022年6月27
> 日周一
> > 16:05写道:
> >
> > >
> > > Hi,
> > >
> > > I want to add myself to MAINTAINERS for write after approval.
> > >
> > > OK for master?
> > >
> > > ChangeLog:
> > >
> > > * MAINTAINERS (Write After Approval): Add myself.
> > > ---
> > >  MAINTAINERS | 1 +
> > >  1 file changed, 1 insertion(+)
> > >
> > > diff --git a/MAINTAINERS b/MAINTAINERS index
> > > 54d8ad41a6f..49627e5d113 100644
> > > --- a/MAINTAINERS
> > > +++ b/MAINTAINERS
> > > @@ -698,6 +698,7 @@ Shujing Zhao
> 
> > >  Jon Ziegler
> > >  Roman Zippel   
> > >  Josef Zlomek   
> > > +Lingling Kong  
> > >
> > > Bug database only accounts
> > >
> > > --
> > > 2.18.1
> > >


[PATCH] MAINTAINERS: Add myself for write after approval

2022-06-27 Thread Kong, Lingling via Gcc-patches
Hi,

I want to add myself to MAINTAINERS for write after approval.

OK for master?

ChangeLog:

* MAINTAINERS (Write After Approval): Add myself.
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 54d8ad41a6f..49627e5d113 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -698,6 +698,7 @@ Shujing Zhao

 Jon Ziegler
 Roman Zippel   
 Josef Zlomek   
+Lingling Kong  
 
Bug database only accounts
 
-- 
2.18.1



[PATCH] i386: Enable intrinsics that convert float and bf16 data to each other.

2021-12-21 Thread Kong, Lingling via Gcc-patches
Hi,


This patch is to enable intrinsics that convert float and bf16 data to each 
other.
Ok for master?

gcc/ChangeLog:

* config/i386/avx512bf16intrin.h (_mm_cvtsbh_ss): Add new intrinsic.
(_mm512_cvtpbh_ps): Likewise.
(_mm512_maskz_cvtpbh_ps): Likewise.
(_mm512_mask_cvtpbh_ps): Likewise.
* config/i386/avx512bf16vlintrin.h (_mm_cvtness_sbh): Likewise.
(_mm_cvtpbh_ps): Likewise.
(_mm256_cvtpbh_ps): Likewise.
(_mm_maskz_cvtpbh_ps): Likewise.
(_mm256_maskz_cvtpbh_ps): Likewise.
(_mm_mask_cvtpbh_ps): Likewise.
(_mm256_mask_cvtpbh_ps): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: New test.
* gcc.target/i386/avx512bf16-vcvtpbh2ps-1.c: Ditto.
* gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Ditto.
* gcc.target/i386/avx512bf16vl-vcvtpbh2ps-1.c: Ditto.
---
 gcc/config/i386/avx512bf16intrin.h| 36 +++
 gcc/config/i386/avx512bf16vlintrin.h  | 63 +++
 .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c  | 15 +  
.../gcc.target/i386/avx512bf16-vcvtpbh2ps-1.c | 20 ++
 .../i386/avx512bf16vl-cvtness2sbh-1.c | 14 +
 .../i386/avx512bf16vl-vcvtpbh2ps-1.c  | 29 +
 6 files changed, 177 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16-cvtsbh2ss-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16-vcvtpbh2ps-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtpbh2ps-1.c

diff --git a/gcc/config/i386/avx512bf16intrin.h 
b/gcc/config/i386/avx512bf16intrin.h
index 9afc6bd7d2b..6b62dc3e398 100644
--- a/gcc/config/i386/avx512bf16intrin.h
+++ b/gcc/config/i386/avx512bf16intrin.h
@@ -41,6 +41,16 @@ typedef short __v32bh __attribute__ ((__vector_size__ (64)));
vector types, and their scalar components.  */  typedef short __m512bh 
__attribute__ ((__vector_size__ (64), __may_alias__));
 
+/* Convert One BF16 Data to One Single Float Data.  */ extern __inline 
+float __attribute__ ((__gnu_inline__, __always_inline__, 
+__artificial__)) _mm_cvtsbh_ss (__bfloat16 __A) {
+  union{ float a; unsigned int b;} __tmp;
+  __tmp.b = ((unsigned int)(__A)) << 16;
+  return __tmp.a;
+}
+
 /* vcvtne2ps2bf16 */
 
 extern __inline __m512bh
@@ -110,6 +120,32 @@ _mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, 
__m512bh __C, __m512bh __D)
   return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A);  }
 
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_cvtpbh_ps (__m256bh __A) {
+  return (__m512)_mm512_castsi512_ps ((__m512i)_mm512_slli_epi32 (
+(__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16)); }
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_maskz_cvtpbh_ps (__mmask16 __U, __m256bh __A) {
+  return (__m512)_mm512_castsi512_ps ((__m512i) _mm512_slli_epi32 (
+(__m512i)_mm512_maskz_cvtepi16_epi32 (
+(__mmask16)__U, (__m256i)__A), 16));
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_cvtpbh_ps (__m512 __S, __mmask16 __U, __m256bh __A) {
+  return (__m512)_mm512_castsi512_ps ((__m512i)(_mm512_mask_slli_epi32 (
+(__m512i)__S, (__mmask16)__U,
+(__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16))); }
+
 #ifdef __DISABLE_AVX512BF16__
 #undef __DISABLE_AVX512BF16__
 #pragma GCC pop_options
diff --git a/gcc/config/i386/avx512bf16vlintrin.h 
b/gcc/config/i386/avx512bf16vlintrin.h
index 6dd396d4008..5e6a6503aa6 100644
--- a/gcc/config/i386/avx512bf16vlintrin.h
+++ b/gcc/config/i386/avx512bf16vlintrin.h
@@ -43,6 +43,7 @@ typedef short __v8bh __attribute__ ((__vector_size__ (16)));  
typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));  
typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
 
+typedef unsigned short __bfloat16;
 /* vcvtne2ps2bf16 */
 
 extern __inline __m256bh
@@ -175,6 +176,68 @@ _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh 
__C, __m128bh __D)
   return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);  }
 
+extern __inline __bfloat16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_cvtness_sbh (float __A) {
+  __v4sf __V = {__A, 0, 0, 0};
+  __v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
+  (__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
+  return __R[0];
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_cvtpbh_ps (__m128bh __A) {
+  return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
+(__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16)); }
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_cvtpbh_ps (__m128bh __A) {
+ 

RE: [PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with -mf16c [PR 102811]

2021-11-24 Thread Kong, Lingling via Gcc-patches
OK, this is the patch I am preparing to check in.

-Original Message-
From: Uros Bizjak  
Sent: Wednesday, November 24, 2021 4:49 PM
To: Kong, Lingling 
Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert 
_Float16 to SFmode with -mf16c [PR 102811]

On Wed, Nov 24, 2021 at 9:44 AM Kong, Lingling  wrote:
>
> Hi,
>
> vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with 
> -mf16c. So added define_insn extendhfsf2 and truncsfhf2 for target_f16c.
> Cleared before conversion, updated  movhi_internal and 
> ix86_can_change_mode_class. And fixed some commit message.
>
> OK for master?

OK, with a small adjustment to ChangeLog.

Thanks,
Uros.

> gcc/ChangeLog:
>
> PR target/102811
> * config/i386/i386.c (ix86_can_change_mode_class): Allow 16 bit data 
> in XMM register
> for TARGET_SSE2.
> * config/i386/i386.md (extendhfsf2): Add extendhfsf2 for TARGET_F16C.
> (extendhfdf2): Restrict extendhfdf for TARGET_AVX512FP16 only.
> (*extendhf2): Rename from extendhf2.
> (truncsfhf2): Likewise.
> (truncdfhf2): Likewise.
> (*trunc2): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> PR target/102811
> * gcc.target/i386/pr90773-21.c: Optimize movhi_internal,
> also allow pextrw replace vmovd + movw.

Just write:

* gcc.target/i386/pr90773-21.c: Allow pextrw instead of movw.

> * gcc.target/i386/pr90773-23.c: Ditto.
> * gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: New test.
> ---
>  gcc/config/i386/i386.c|  5 +-
>  gcc/config/i386/i386.md   | 74 +--
>  .../i386/avx512vl-vcvtps2ph-pr102811.c| 11 +++
>  gcc/testsuite/gcc.target/i386/pr90773-21.c|  2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-23.c|  2 +-
>  5 files changed, 83 insertions(+), 11 deletions(-)  create mode 
> 100644 gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 
> e94efdf39fb..4b813533961 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -19485,9 +19485,8 @@ ix86_can_change_mode_class (machine_mode from, 
> machine_mode to,
>  disallow a change to these modes, reload will assume it's ok to
>  drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
>  the vec_dupv4hi pattern.
> -NB: AVX512FP16 supports vmovw which can load 16bit data to sse
> -register.  */
> -  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 
> 4;
> +NB: SSE2 can load 16bit data to sse register via pinsrw.  */
> +  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_SSE2 ? 2 :
> +4;
>if (GET_MODE_SIZE (from) < mov_size)
> return false;
>  }
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
> 6eb9de81921..6ee264f1151 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -2525,6 +2525,16 @@
>  case TYPE_SSEMOV:
>return ix86_output_ssemov (insn, operands);
>
> +case TYPE_SSELOG:
> +  if (SSE_REG_P (operands[0]))
> +   return MEM_P (operands[1])
> + ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"
> + : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";
> +  else
> +   return MEM_P (operands[1])
> + ? "pextrw\t{$0, %1, %0|%0, %1, 0}"
> + : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";
> +
>  case TYPE_MSKLOG:
>if (operands[1] == const0_rtx)
> return "kxorw\t%0, %0, %0";
> @@ -2540,13 +2550,17 @@
>  }
>  }
>[(set (attr "isa")
> -   (cond [(eq_attr "alternative" "9,10,11,12,13")
> - (const_string "avx512fp16")
> +   (cond [(eq_attr "alternative" "9,10,11,12")
> + (const_string "sse2")
> +  (eq_attr "alternative" "13")
> + (const_string "sse4")
>]
>(const_string "*")))
> (set (attr "type")
>   (cond [(eq_attr "alternative" "9,10,11,12,13")
> - (const_string "ssemov")
> + (if_then_else (match_test "TARGET_AVX512FP16")
> +   (const_string "ssemov")
> +   (const_string "sselog"))
> (eq_attr "alternative" "4,5,6,7")
>   (const_string "mskmov")
> (eq_attr 

[PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with -mf16c [PR 102811]

2021-11-24 Thread Kong, Lingling via Gcc-patches
Hi,

vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with 
-mf16c. So added define_insn extendhfsf2 and truncsfhf2 for target_f16c.
Cleared before conversion, updated  movhi_internal and 
ix86_can_change_mode_class. And fixed some commit message.

OK for master?

gcc/ChangeLog:

PR target/102811
* config/i386/i386.c (ix86_can_change_mode_class): Allow 16 bit data in 
XMM register
for TARGET_SSE2.
* config/i386/i386.md (extendhfsf2): Add extendhfsf2 for TARGET_F16C.
(extendhfdf2): Restrict extendhfdf for TARGET_AVX512FP16 only.
(*extendhf2): Rename from extendhf2.
(truncsfhf2): Likewise.
(truncdfhf2): Likewise.
(*trunc2): Likewise.

gcc/testsuite/ChangeLog:

PR target/102811
* gcc.target/i386/pr90773-21.c: Optimize movhi_internal,
also allow pextrw replace vmovd + movw.
* gcc.target/i386/pr90773-23.c: Ditto.
* gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: New test.
---
 gcc/config/i386/i386.c|  5 +-
 gcc/config/i386/i386.md   | 74 +--
 .../i386/avx512vl-vcvtps2ph-pr102811.c| 11 +++
 gcc/testsuite/gcc.target/i386/pr90773-21.c|  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-23.c|  2 +-
 5 files changed, 83 insertions(+), 11 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 
e94efdf39fb..4b813533961 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19485,9 +19485,8 @@ ix86_can_change_mode_class (machine_mode from, 
machine_mode to,
 disallow a change to these modes, reload will assume it's ok to
 drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
 the vec_dupv4hi pattern.
-NB: AVX512FP16 supports vmovw which can load 16bit data to sse
-register.  */
-  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 4;
+NB: SSE2 can load 16bit data to sse register via pinsrw.  */
+  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_SSE2 ? 2 : 
+4;
   if (GET_MODE_SIZE (from) < mov_size)
return false;
 }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
6eb9de81921..6ee264f1151 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2525,6 +2525,16 @@
 case TYPE_SSEMOV:
   return ix86_output_ssemov (insn, operands);
 
+case TYPE_SSELOG:
+  if (SSE_REG_P (operands[0]))
+   return MEM_P (operands[1])
+ ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"
+ : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";
+  else
+   return MEM_P (operands[1])
+ ? "pextrw\t{$0, %1, %0|%0, %1, 0}"
+ : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";
+
 case TYPE_MSKLOG:
   if (operands[1] == const0_rtx)
return "kxorw\t%0, %0, %0";
@@ -2540,13 +2550,17 @@
 }
 }
   [(set (attr "isa")
-   (cond [(eq_attr "alternative" "9,10,11,12,13")
- (const_string "avx512fp16")
+   (cond [(eq_attr "alternative" "9,10,11,12")
+ (const_string "sse2")
+  (eq_attr "alternative" "13")
+ (const_string "sse4")
   ]
   (const_string "*")))
(set (attr "type")
  (cond [(eq_attr "alternative" "9,10,11,12,13")
- (const_string "ssemov")
+ (if_then_else (match_test "TARGET_AVX512FP16")
+   (const_string "ssemov")
+   (const_string "sselog"))
(eq_attr "alternative" "4,5,6,7")
  (const_string "mskmov")
(eq_attr "alternative" "8")
@@ -4574,8 +4588,32 @@
   emit_move_insn (operands[0], CONST0_RTX (V2DFmode));
 })
 
-(define_insn "extendhf2"
-  [(set (match_operand:MODEF 0 "nonimm_ssenomem_operand" "=v")
+(define_expand "extendhfsf2"
+  [(set (match_operand:SF 0 "register_operand")
+   (float_extend:SF
+ (match_operand:HF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 || TARGET_F16C || TARGET_AVX512VL"
+{
+  if (!TARGET_AVX512FP16)
+{
+  rtx res = gen_reg_rtx (V4SFmode);
+  rtx tmp = force_reg (V8HFmode, CONST0_RTX (V8HFmode));
+
+  ix86_expand_vector_set (false, tmp, operands[1], 0);
+  emit_insn (gen_vcvtph2ps (res, gen_lowpart (V8HImode, tmp)));
+  emit_move_insn (operands[0], gen_lowpart (SFmode, res));
+  DONE;
+}
+})
+
+(define_expand "extendhfdf2"
+  [(set (match_operand:DF 0 "register_operand")
+   (float_extend:DF
+ (match_operand:HF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16")
+
+(define_insn "*extendhf2"
+  [(set (match_operand:MODEF 0 "register_operand" "=v")
 (float_extend:MODEF
  (match_operand:HF 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512FP16"
@@ -4766,7 +4804,31 @@
 
 ;; Conversion from {SF,DF}mode to HFmode.
 
-(define_insn "trunchf2"
+(define_expand "truncsfhf2"
+  [(set 

RE: [PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with -mf16c [PR 102811]

2021-11-24 Thread Kong, Lingling via Gcc-patches
Hi  Uros,

> BTW: When playing with my patch, I introduced (define_insn "*vec_set_0" 
> ...) to optimize scalar load to a vector. Does ix86_expand_vector_set work OK 
> without this pattern?

Yes, ix86_expand_vector_set works OK without it: the existing (define_insn 
"_pinsr") pattern can already optimize a scalar load to a 
vector.

Thanks,
Lingling

-Original Message-
From: Uros Bizjak  
Sent: Wednesday, November 24, 2021 3:57 PM
To: Kong, Lingling 
Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert 
_Float16 to SFmode with -mf16c [PR 102811]

On Wed, Nov 24, 2021 at 7:25 AM Kong, Lingling via Gcc-patches 
 wrote:
>
> Hi,
>
> vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with 
> -mf16c. So added define_insn extendhfsf2 and truncsfhf2 for target_f16c.
> And cleared before conversion, updated  movhi_internal and 
> ix86_can_change_mode_class.

Please fix the above commit message.

>
> OK for master?
>
> gcc/ChangeLog:
>
> PR target/102811
> * config/i386/i386.c (ix86_can_change_mode_class): SSE2 can load 
> 16bit data
> to sse register via pinsrw.

Allow 16bit data in XMM register for SSE2 targets.

> * config/i386/i386.md (extendhfsf2): Add extenndhfsf2 for f16c.

... for TARGET_F16C.

> (extendhfdf2): Split extendhf2 into separate extendhfsf2, 
> extendhfdf2.
> extendhfdf only for target_avx512fp16.

Restrict extendhfdf for TARGET_AVX512FP16 only.

> (*extendhf2):rename extendhf2.

Rename from extendhf2.

> (truncsfhf2): Likewise.
> (truncdfhf2): Likewise.
> (*trunc2): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> PR target/102811
> * gcc.target/i386/pr90773-21.c: Optimized movhi_internal,
> optimize vmovd + movw to vpextrw.

Also allow pextrw.

> * gcc.target/i386/pr90773-23.c: Ditto.
> * gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: New test.

Otherwise LGTM.

BTW: When playing with my patch, I introduced (define_insn "*vec_set_0" 
...) to optimize scalar load to a vector. Does ix86_expand_vector_set work OK 
without this pattern?

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.c|  5 +-
>  gcc/config/i386/i386.md   | 74 +--
>  .../i386/avx512vl-vcvtps2ph-pr102811.c| 11 +++
>  gcc/testsuite/gcc.target/i386/pr90773-21.c|  2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-23.c|  2 +-
>  5 files changed, 83 insertions(+), 11 deletions(-)  create mode 
> 100644 gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 
> e94efdf39fb..4b813533961 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -19485,9 +19485,8 @@ ix86_can_change_mode_class (machine_mode from, 
> machine_mode to,
>  disallow a change to these modes, reload will assume it's ok to
>  drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
>  the vec_dupv4hi pattern.
> -NB: AVX512FP16 supports vmovw which can load 16bit data to sse
> -register.  */
> -  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 
> 4;
> +NB: SSE2 can load 16bit data to sse register via pinsrw.  */
> +  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_SSE2 ? 2 :
> +4;
>if (GET_MODE_SIZE (from) < mov_size)
> return false;
>  }
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
> 6eb9de81921..6ee264f1151 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -2525,6 +2525,16 @@
>  case TYPE_SSEMOV:
>return ix86_output_ssemov (insn, operands);
>
> +case TYPE_SSELOG:
> +  if (SSE_REG_P (operands[0]))
> +   return MEM_P (operands[1])
> + ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"
> + : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";
> +  else
> +   return MEM_P (operands[1])
> + ? "pextrw\t{$0, %1, %0|%0, %1, 0}"
> + : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";
> +
>  case TYPE_MSKLOG:
>if (operands[1] == const0_rtx)
> return "kxorw\t%0, %0, %0";
> @@ -2540,13 +2550,17 @@
>  }
>  }
>[(set (attr "isa")
> -   (cond [(eq_attr "alternative" "9,10,11,12,13")
> - (const_string "avx512fp16")
> +   (cond [(eq_attr "alternative" "9,10,11,12")
> + (const_string "sse2")
> +  (eq_attr "alternative" 

RE: [PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with -mf16c [PR 102811]

2021-11-23 Thread Kong, Lingling via Gcc-patches
Hi,

vcvtph2ps and vcvtps2ph should be used to convert between _Float16 and SFmode
with -mf16c, so this patch adds extendhfsf2 and truncsfhf2 patterns for TARGET_F16C.
It also clears the vector register before the conversion and updates
movhi_internal and ix86_can_change_mode_class.

OK for master?

gcc/ChangeLog:

PR target/102811
* config/i386/i386.c (ix86_can_change_mode_class): SSE2 can load 16bit 
data
to sse register via pinsrw.
* config/i386/i386.md (extendhfsf2): Add extendhfsf2 for f16c.
(extendhfdf2): Split extendhf2 into separate extendhfsf2, 
extendhfdf2.
extendhfdf only for target_avx512fp16.
(*extendhf2):rename extendhf2.
(truncsfhf2): Likewise.
(truncdfhf2): Likewise.
(*trunc2): Likewise.

gcc/testsuite/ChangeLog:

PR target/102811
* gcc.target/i386/pr90773-21.c: Optimized movhi_internal,
optimize vmovd + movw to vpextrw.
* gcc.target/i386/pr90773-23.c: Ditto.
* gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: New test.
---
 gcc/config/i386/i386.c|  5 +-
 gcc/config/i386/i386.md   | 74 +--
 .../i386/avx512vl-vcvtps2ph-pr102811.c| 11 +++
 gcc/testsuite/gcc.target/i386/pr90773-21.c|  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-23.c|  2 +-
 5 files changed, 83 insertions(+), 11 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 
e94efdf39fb..4b813533961 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19485,9 +19485,8 @@ ix86_can_change_mode_class (machine_mode from, 
machine_mode to,
 disallow a change to these modes, reload will assume it's ok to
 drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
 the vec_dupv4hi pattern.
-NB: AVX512FP16 supports vmovw which can load 16bit data to sse
-register.  */
-  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 4;
+NB: SSE2 can load 16bit data to sse register via pinsrw.  */
+  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_SSE2 ? 2 : 
+4;
   if (GET_MODE_SIZE (from) < mov_size)
return false;
 }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
6eb9de81921..6ee264f1151 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2525,6 +2525,16 @@
 case TYPE_SSEMOV:
   return ix86_output_ssemov (insn, operands);
 
+case TYPE_SSELOG:
+  if (SSE_REG_P (operands[0]))
+   return MEM_P (operands[1])
+ ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"
+ : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";
+  else
+   return MEM_P (operands[1])
+ ? "pextrw\t{$0, %1, %0|%0, %1, 0}"
+ : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";
+
 case TYPE_MSKLOG:
   if (operands[1] == const0_rtx)
return "kxorw\t%0, %0, %0";
@@ -2540,13 +2550,17 @@
 }
 }
   [(set (attr "isa")
-   (cond [(eq_attr "alternative" "9,10,11,12,13")
- (const_string "avx512fp16")
+   (cond [(eq_attr "alternative" "9,10,11,12")
+ (const_string "sse2")
+  (eq_attr "alternative" "13")
+ (const_string "sse4")
   ]
   (const_string "*")))
(set (attr "type")
  (cond [(eq_attr "alternative" "9,10,11,12,13")
- (const_string "ssemov")
+ (if_then_else (match_test "TARGET_AVX512FP16")
+   (const_string "ssemov")
+   (const_string "sselog"))
(eq_attr "alternative" "4,5,6,7")
  (const_string "mskmov")
(eq_attr "alternative" "8")
@@ -4574,8 +4588,32 @@
   emit_move_insn (operands[0], CONST0_RTX (V2DFmode));
 })
 
-(define_insn "extendhf2"
-  [(set (match_operand:MODEF 0 "nonimm_ssenomem_operand" "=v")
+(define_expand "extendhfsf2"
+  [(set (match_operand:SF 0 "register_operand")
+   (float_extend:SF
+ (match_operand:HF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 || TARGET_F16C || TARGET_AVX512VL"
+{
+  if (!TARGET_AVX512FP16)
+{
+  rtx res = gen_reg_rtx (V4SFmode);
+  rtx tmp = force_reg (V8HFmode, CONST0_RTX (V8HFmode));
+
+  ix86_expand_vector_set (false, tmp, operands[1], 0);
+  emit_insn (gen_vcvtph2ps (res, gen_lowpart (V8HImode, tmp)));
+  emit_move_insn (operands[0], gen_lowpart (SFmode, res));
+  DONE;
+}
+})
+
+(define_expand "extendhfdf2"
+  [(set (match_operand:DF 0 "register_operand")
+   (float_extend:DF
+ (match_operand:HF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16")
+
+(define_insn "*extendhf2"
+  [(set (match_operand:MODEF 0 "register_operand" "=v")
 (float_extend:MODEF
  (match_operand:HF 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512FP16"
@@ -4766,7 +4804,31 @@
 
 ;; Conversion from {SF,DF}mode to HFmode.
 
-(define_insn "trunchf2"
+(define_expand "truncsfhf2"
+  

[PATCH] i386: add alias for f*mul_*ch intrinsics

2021-11-16 Thread Kong, Lingling via Gcc-patches
Hi,

This patch adds aliases for the f*mul_*ch intrinsics.

Ok for master?

gcc/ChangeLog:

* config/i386/avx512fp16intrin.h (_mm512_mul_pch): Add alias for 
_mm512_fmul_pch.
(_mm512_mask_mul_pch): Likewise.
(_mm512_maskz_mul_pch): Likewise.
(_mm512_mul_round_pch): Likewise.
(_mm512_mask_mul_round_pch): Likewise.
(_mm512_maskz_mul_round_pch): Likewise.
(_mm512_cmul_pch): Likewise.
(_mm512_mask_cmul_pch): Likewise.
(_mm512_maskz_cmul_pch): Likewise.
(_mm512_cmul_round_pch): Likewise.
(_mm512_mask_cmul_round_pch): Likewise.
(_mm512_maskz_cmul_round_pch): Likewise.
(_mm_mul_sch): Likewise.
(_mm_mask_mul_sch): Likewise.
(_mm_maskz_mul_sch): Likewise.
(_mm_mul_round_sch): Likewise.
(_mm_mask_mul_round_sch): Likewise.
(_mm_maskz_mul_round_sch): Likewise.
(_mm_cmul_sch): Likewise.
(_mm_mask_cmul_sch): Likewise.
(_mm_maskz_cmul_sch): Likewise.
(_mm_cmul_round_sch): Likewise.
(_mm_mask_cmul_round_sch): Likewise.
(_mm_maskz_cmul_round_sch): Likewise.
* config/i386/avx512fp16vlintrin.h (_mm_mul_pch): Likewise.
(_mm_mask_mul_pch): Likewise.
(_mm_maskz_mul_pch): Likewise.
(_mm256_mul_pch): Likewise.
(_mm256_mask_mul_pch): Likewise.
(_mm256_maskz_mul_pch): Likewise.
(_mm_cmul_pch): Likewise.
(_mm_mask_cmul_pch): Likewise.
(_mm_maskz_cmul_pch): Likewise.
(_mm256_cmul_pch): Likewise.
(_mm256_mask_cmul_pch): Likewise.
(_mm256_maskz_cmul_pch): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-vfcmulcph-1a.c: Add new test for alias.
* gcc.target/i386/avx512fp16-vfcmulcsh-1a.c: Likewise.
* gcc.target/i386/avx512fp16-vfmulcph-1a.c: Likewise.
* gcc.target/i386/avx512fp16-vfmulcsh-1a.c: Likewise.
* gcc.target/i386/avx512fp16vl-vfcmulcph-1a.c: Likewise.
* gcc.target/i386/avx512fp16vl-vfmulcph-1a.c: Likewise.
---
 gcc/config/i386/avx512fp16intrin.h| 39 +++
 gcc/config/i386/avx512fp16vlintrin.h  | 17 
 .../gcc.target/i386/avx512fp16-vfcmulcph-1a.c | 19 ++---  
.../gcc.target/i386/avx512fp16-vfcmulcsh-1a.c | 19 ++---  
.../gcc.target/i386/avx512fp16-vfmulcph-1a.c  | 19 ++---  
.../gcc.target/i386/avx512fp16-vfmulcsh-1a.c  | 19 ++---
 .../i386/avx512fp16vl-vfcmulcph-1a.c  | 20 +++---
 .../i386/avx512fp16vl-vfmulcph-1a.c   | 20 +++---
 8 files changed, 136 insertions(+), 36 deletions(-)

diff --git a/gcc/config/i386/avx512fp16intrin.h 
b/gcc/config/i386/avx512fp16intrin.h
index 44c5e24f234..fe73e693897 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -7162,6 +7162,45 @@ _mm512_set1_pch (_Float16 _Complex __A)
   return (__m512h) _mm512_set1_ps (u.b);  }
 
+// intrinsics below are alias for f*mul_*ch #define _mm512_mul_pch(A, 
+B) _mm512_fmul_pch ((A), (B))
+#define _mm512_mask_mul_pch(W, U, A, B)  \
+  _mm512_mask_fmul_pch ((W), (U), (A), (B)) #define 
+_mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch ((U), (A), (B)) 
+#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch ((A), (B), (R))
+#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
+  _mm512_mask_fmul_round_pch ((W), (U), (A), (B), (R))
+#define _mm512_maskz_mul_round_pch(U, A, B, R)   \
+  _mm512_maskz_fmul_round_pch ((U), (A), (B), (R))
+
+#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch ((A), (B))
+#define _mm512_mask_cmul_pch(W, U, A, B) \
+  _mm512_mask_fcmul_pch ((W), (U), (A), (B)) #define 
+_mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch ((U), (A), (B)) 
+#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch ((A), (B), (R))
+#define _mm512_mask_cmul_round_pch(W, U, A, B, R)\
+  _mm512_mask_fcmul_round_pch ((W), (U), (A), (B), (R))
+#define _mm512_maskz_cmul_round_pch(U, A, B, R)  \
+  _mm512_maskz_fcmul_round_pch ((U), (A), (B), (R))
+
+#define _mm_mul_sch(A, B) _mm_fmul_sch ((A), (B)) #define 
+_mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch ((W), (U), (A), (B)) 
+#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch ((U), (A), (B)) 
+#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch ((A), (B), (R))
+#define _mm_mask_mul_round_sch(W, U, A, B, R)\
+  _mm_mask_fmul_round_sch ((W), (U), (A), (B), (R))
+#define _mm_maskz_mul_round_sch(U, A, B, R)  \
+  _mm_maskz_fmul_round_sch ((U), (A), (B), (R))
+
+#define _mm_cmul_sch(A, B) _mm_fcmul_sch ((A), (B)) #define 
+_mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch ((W), (U), (A), (B)) 
+#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch ((U), (A), (B)) 
+#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch ((A), (B), (R))

[PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with -mf16c [PR 102811]

2021-11-16 Thread Kong, Lingling via Gcc-patches
Hi,

vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with 
-mf16c. So added define_insn extendhfsf2 and truncsfhf2 for target_f16c.

OK for master?

gcc/ChangeLog:

PR target/102811
* config/i386/i386.md (extendhfsf2): Add extendhfsf2 for f16c.
(extendhfdf2): Split extendhf2 into separate extendhfsf2, 
extendhfdf2.
(truncsfhf2): Likewise.
(truncdfhf2): Likewise.

gcc/testsuite/ChangeLog:

PR target/102811
* gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: New test.
---
 gcc/config/i386/i386.md   | 48 +++
 .../i386/avx512vl-vcvtps2ph-pr102811.c| 10 
 2 files changed, 49 insertions(+), 9 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
6eb9de81921..c5415475342 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4574,15 +4574,30 @@
   emit_move_insn (operands[0], CONST0_RTX (V2DFmode));
 })
 
-(define_insn "extendhf2"
-  [(set (match_operand:MODEF 0 "nonimm_ssenomem_operand" "=v")
-(float_extend:MODEF
+(define_insn "extendhfsf2"
+  [(set (match_operand:SF 0 "register_operand" "=v")
+   (float_extend:SF
+ (match_operand:HF 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512FP16 || TARGET_F16C || TARGET_AVX512VL"
+{
+  if (TARGET_AVX512FP16)
+return "vcvtsh2ss\t{%1, %0, %0|%0, %0, %1}";
+  else
+return "vcvtph2ps\t{%1, %0|%0, %1}"; }
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "SF")])
+
+(define_insn "extendhfdf2"
+  [(set (match_operand:DF 0 "nonimm_ssenomem_operand" "=v")
+   (float_extend:DF
  (match_operand:HF 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512FP16"
-  "vcvtsh2\t{%1, %0, %0|%0, %0, %1}"
+  "vcvtsh2sd\t{%1, %0, %0|%0, %0, %1}"
   [(set_attr "type" "ssecvt")
(set_attr "prefix" "evex")
-   (set_attr "mode" "")])
+   (set_attr "mode" "DF")])
 
 
 (define_expand "extendxf2"
@@ -4766,12 +4781,27 @@
 
 ;; Conversion from {SF,DF}mode to HFmode.
 
-(define_insn "trunchf2"
+(define_insn "truncsfhf2"
+  [(set (match_operand:HF 0 "register_operand" "=v")
+   (float_truncate:HF
+ (match_operand:SF 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512FP16 || TARGET_F16C || TARGET_AVX512VL"
+  {
+if (TARGET_AVX512FP16)
+  return "vcvtss2sh\t{%1, %d0|%d0, %1}";
+else
+  return "vcvtps2ph\t{0, %1, %0|%0, %1, 0}";
+  }
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "HF")])
+
+(define_insn "truncdfhf2"
   [(set (match_operand:HF 0 "register_operand" "=v")
-   (float_truncate:HF
- (match_operand:MODEF 1 "nonimmediate_operand" "vm")))]
+   (float_truncate:HF
+ (match_operand:DF 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512FP16"
-  "vcvt2sh\t{%1, %d0|%d0, %1}"
+  "vcvtsd2sh\t{%1, %d0|%d0, %1}"
   [(set_attr "type" "ssecvt")
(set_attr "prefix" "evex")
(set_attr "mode" "HF")])
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
new file mode 100644
index 000..ab44a304a03
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mf16c -mno-avx512fp16" } */
+/* { dg-final { scan-assembler-times "vcvtph2ps\[ \\t\]" 2 } } */
+/* { dg-final { scan-assembler-times "vcvtps2ph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-not "__truncsfhf2\[ \\t\]"} } */
+/* { dg-final { scan-assembler-not "__extendhfsf2\[ \\t\]"} } */
+_Float16 test (_Float16 a, _Float16 b)
+{
+  return a + b;
+}
--
2.18.1



[PATCH] i386: Optimization for mm512_set1_pch.

2021-11-05 Thread Kong, Lingling via Gcc-patches
Hi,

This patch adds support for folding
_mm512_fmadd_pch (a, _mm512_set1_pch(*(b)), c) into a single instruction,
vfmaddcph (%rsp){1to16}, %zmm1, %zmm2.
OK for master?

gcc/ChangeLog:

* config/i386/sse.md (fma___pair):
Add new define_insn.
(fma__fmaddc_bcst): Add new define_insn_and_split.
(fma__fcmaddc_bcst): Likewise

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16vl-complex-broadcast-1.c: New test.
---
 gcc/config/i386/sse.md| 62 +++
 .../i386/avx512fp16vl-complex-broadcast-1.c   | 25 
 2 files changed, 87 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
0a7f5b178f9..eba8e77515f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -193,7 +193,9 @@
 
   ;; For AVX512FP16 suppport
   UNSPEC_COMPLEX_FMA
+  UNSPEC_COMPLEX_FMA_PAIR
   UNSPEC_COMPLEX_FCMA
+  UNSPEC_COMPLEX_FCMA_PAIR
   UNSPEC_COMPLEX_FMUL
   UNSPEC_COMPLEX_FCMUL
   UNSPEC_COMPLEX_MASK
@@ -5913,6 +5915,9 @@
 (define_int_iterator UNSPEC_COMPLEX_F_C_MA
[UNSPEC_COMPLEX_FMA UNSPEC_COMPLEX_FCMA])
 
+(define_int_iterator UNSPEC_COMPLEX_F_C_MA_PAIR
+   [UNSPEC_COMPLEX_FMA_PAIR UNSPEC_COMPLEX_FCMA_PAIR])
+
 (define_int_iterator UNSPEC_COMPLEX_F_C_MUL
[UNSPEC_COMPLEX_FMUL UNSPEC_COMPLEX_FCMUL])
 
@@ -5922,6 +5927,10 @@
 (UNSPEC_COMPLEX_FMUL "fmulc")
 (UNSPEC_COMPLEX_FCMUL "fcmulc")])
 
+(define_int_attr complexpairopname
+   [(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
+(UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
+
 (define_mode_attr complexmove
   [(V32HF "avx512f_loadv16sf")
(V16HF "avx512vl_loadv8sf")
@@ -6067,6 +6076,59 @@
  [(match_dup 1) (match_dup 2) (match_dup 4)]
   UNSPEC_COMPLEX_F_C_MA))])
 
+(define_insn "fma___pair"
+ [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=")
+   (unspec:VF1_AVX512VL
+[(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
+ (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
+ (match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
+ UNSPEC_COMPLEX_F_C_MA_PAIR))]
+ "TARGET_AVX512FP16"
+ "vph\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "type" "ssemuladd")])
+
+(define_insn_and_split "fma__fmaddc_bcst"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+  (subreg:VF_AVX512FP16VL
+(match_operand: 2 "bcst_vector_operand") 0)
+  (match_operand:VF_AVX512FP16VL 3 "vector_operand")]
+  UNSPEC_COMPLEX_FMA))]
+  "TARGET_AVX512FP16"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:
+ [(match_dup 1) (match_dup 2) (match_dup 3)]
+  UNSPEC_COMPLEX_FMA_PAIR))]
+  {
+operands[0] = lowpart_subreg (mode, operands[0], mode);
+operands[1] = lowpart_subreg (mode, operands[1], mode);
+operands[3] = lowpart_subreg (mode, operands[3], 
+mode);
+  })
+
+(define_insn_and_split "fma__fcmaddc_bcst"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+  (subreg:VF_AVX512FP16VL
+(match_operand: 2 "bcst_vector_operand") 0)
+  (match_operand:VF_AVX512FP16VL 3 "vector_operand")]
+  UNSPEC_COMPLEX_FCMA))]
+  "TARGET_AVX512FP16"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:
+ [(match_dup 1) (match_dup 2) (match_dup 3)]
+  UNSPEC_COMPLEX_FCMA_PAIR))]
+  {
+operands[0] = lowpart_subreg (mode, operands[0], mode);
+operands[1] = lowpart_subreg (mode, operands[1], mode);
+operands[3] = lowpart_subreg (mode, operands[3], 
+mode);
+  })
+
 (define_insn "___mask"
   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=")
(vec_merge:VF_AVX512FP16VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
new file mode 100644
index 000..3c8e84230f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } }  */
+
+#include 
+
+volatile __m512h res0, a0, c0;
+volatile __m256h res1, a1, c1;
+volatile __m128h res2, a2, c2;
+volatile _Float16 *b;
+
+void extern
+avx_test(void)
+{
+  res0 = _mm512_fmadd_pch (a0, _mm512_set1_pch(*(b + 2 * 6)), c0);
+  res0 = _mm512_fcmadd_pch (a0, _mm512_set1_pch(*(b + 2 * 6)), c0);
+
+  res1 = _mm256_fmadd_pch (a1, _mm256_set1_pch(*(b + 2 * 6)), c1);
+  res1 = _mm256_fcmadd_pch (a1, _mm256_set1_pch(*(b + 2 * 6)), c1);
+
+  res2 =  

[PATCH] i386: Support complex fma/conj_fma for _Float16.

2021-11-05 Thread Kong, Lingling via Gcc-patches
Hi,

This patch is to support cmla_optab, cmul_optab, cmla_conj_optab, 
cmul_conj_optab for vector _Float16.
Ok for master?

gcc/ChangeLog:

* config/i386/sse.md (cmul3): add new define_expand.
(cmla4): Likewise

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-vector-complex-float.c: New test.
---
 gcc/config/i386/sse.md| 23 +++
 .../i386/avx512fp16-vector-complex-float.c| 40 +++
 2 files changed, 63 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/i386/avx512fp16-vector-complex-float.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
0a7f5b178f9..8d3fef0a31a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5922,6 +5922,12 @@
 (UNSPEC_COMPLEX_FMUL "fmulc")
 (UNSPEC_COMPLEX_FCMUL "fcmulc")])
 
+(define_int_attr conj_op
+   [(UNSPEC_COMPLEX_FMA "")
+(UNSPEC_COMPLEX_FCMA "_conj")
+(UNSPEC_COMPLEX_FMUL "")
+(UNSPEC_COMPLEX_FCMUL "_conj")])
+
 (define_mode_attr complexmove
   [(V32HF "avx512f_loadv16sf")
(V16HF "avx512vl_loadv8sf")
@@ -6003,6 +6009,15 @@
   DONE;
 })
 
+(define_expand "cmla4"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (unspec:VF_AVX512FP16VL
+   [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+(match_operand:VF_AVX512FP16VL 2 "vector_operand")
+(match_operand:VF_AVX512FP16VL 3 "vector_operand")]
+UNSPEC_COMPLEX_F_C_MA))]
+  "TARGET_AVX512FP16")
+
 (define_insn "fma__"
   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=")
(unspec:VF_AVX512FP16VL
@@ -6084,6 +6099,14 @@
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 
+(define_expand "cmul3"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+  (match_operand:VF_AVX512FP16VL 2 "vector_operand")]
+  UNSPEC_COMPLEX_F_C_MUL))]
+  "TARGET_AVX512FP16")
+
 (define_insn "__"
   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=")
  (unspec:VF_AVX512FP16VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vector-complex-float.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vector-complex-float.c
new file mode 100644
index 000..bcb957f0de0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vector-complex-float.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vfmaddcph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-not "vfmadd\[123]*ph\[ \\t\]"} } */
+/* { dg-final { scan-assembler-not "vfmadd\[123]*sh\[ \\t\]"} } */
+/* { dg-final { scan-assembler-times "vfcmaddcph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-times "vfmulcph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-times "vfcmulcph\[ \\t\]" 1 } } */
+
+#include
+#define TYPE _Float16
+#define N 16
+
+void fma0 (_Complex TYPE *a, _Complex TYPE *b,
+   _Complex TYPE *c)
+{
+  for (int i = 0; i < N; i++)
+c[i] += a[i] * b[i];
+}
+
+void fmaconj (_Complex TYPE a[restrict N], _Complex TYPE b[restrict N],
+ _Complex TYPE c[restrict N])
+{
+  for (int i = 0; i < N; i++)
+c[i] += a[i] * ~b[i];
+}
+
+void fmul (_Complex TYPE a[restrict N], _Complex TYPE b[restrict N],
+  _Complex TYPE c[restrict N])
+{
+  for (int i = 0; i < N; i++)
+c[i] = a[i] * b[i];
+}
+
+void fmulconj (_Complex TYPE a[restrict N], _Complex TYPE b[restrict N],
+  _Complex TYPE c[restrict N])
+{
+  for (int i = 0; i < N; i++)
+c[i] = a[i] * ~b[i];
+}
--
2.18.1



[PATCH] i386: Combine the FADD(A, FMA(B, C, 0)) to FMA(B, C, A) and combine FADD(A, FMUL(B, C)) to FMA(B, C, A).

2021-10-21 Thread Kong, Lingling via Gcc-patches
Hi,

Under fast-math, this patch adds support for transforming expressions such as
_mm512_add_ph(x1, _mm512_fmadd_pch(a, b, _mm512_setzero_ph())) into
_mm512_fmadd_pch(a, b, x1).

It also adds support for transforming _mm512_add_ph(x1, _mm512_fmul_pch(a, b))
into _mm512_fmadd_pch(a, b, x1).
Ok for master?

gcc/ChangeLog:

* config/i386/sse.md (fma__fadd_fmul): Add new
define_insn_and_split.
(fma__fadd_fcmul):Likewise
(fma___fma_zero):Likewise

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-complex-fma.c: New test.
---
 gcc/config/i386/sse.md| 52 +++
 .../gcc.target/i386/avx512fp16-complex-fma.c  | 18 +++
 2 files changed, 70 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-complex-fma.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
fbf056bf9e6..36407ca4a59 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5958,6 +5958,58 @@
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 
+(define_insn_and_split "fma__fadd_fmul"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (plus:VF_AVX512FP16VL
+ (unspec:VF_AVX512FP16VL
+   [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+(match_operand:VF_AVX512FP16VL 2 "vector_operand")]
+UNSPEC_COMPLEX_FMUL)
+ (match_operand:VF_AVX512FP16VL 3 "vector_operand")))]
+  "TARGET_AVX512FP16 && flag_unsafe_math_optimizations
+  && ix86_pre_reload_split()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:VF_AVX512FP16VL
+ [(match_dup 1) (match_dup 2) (match_dup 3)]
+  UNSPEC_COMPLEX_FMA))])
+
+(define_insn_and_split "fma__fadd_fcmul"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (plus:VF_AVX512FP16VL
+ (unspec:VF_AVX512FP16VL
+   [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+(match_operand:VF_AVX512FP16VL 2 "vector_operand")]
+UNSPEC_COMPLEX_FCMUL)
+ (match_operand:VF_AVX512FP16VL 3 "vector_operand")))]
+  "TARGET_AVX512FP16 && flag_unsafe_math_optimizations
+  && ix86_pre_reload_split()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:VF_AVX512FP16VL
+ [(match_dup 1) (match_dup 2) (match_dup 3)]
+  UNSPEC_COMPLEX_FCMA))])
+
+(define_insn_and_split "fma___fma_zero"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (plus:VF_AVX512FP16VL
+ (unspec:VF_AVX512FP16VL
+   [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+(match_operand:VF_AVX512FP16VL 2 "vector_operand")
+(match_operand:VF_AVX512FP16VL 3 "const0_operand")]
+UNSPEC_COMPLEX_F_C_MA)
+ (match_operand:VF_AVX512FP16VL 4 "vector_operand")))]
+  "TARGET_AVX512FP16 && flag_unsafe_math_optimizations
+  && ix86_pre_reload_split()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:VF_AVX512FP16VL
+ [(match_dup 1) (match_dup 2) (match_dup 4)]
+  UNSPEC_COMPLEX_F_C_MA))])
+
 (define_insn "___mask"
   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=")
(vec_merge:VF_AVX512FP16VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-complex-fma.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-complex-fma.c
new file mode 100644
index 000..2dfd369e785
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-complex-fma.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -O2 -Ofast" } */
+/* { dg-final { scan-assembler-times "vfmaddcph\[ 
+\\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(
+?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-not "vaddph\[ 
+\\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(
+?:\n|\[ \\t\]+#)"} } */
+/* { dg-final { scan-assembler-not "vfmulcph\[ 
+\\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(
+?:\n|\[ \\t\]+#)"} } */
+/* { dg-final { scan-assembler-times "vfcmaddcph\[ 
+\\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(
+?:\n|\[ \\t\]+#)" 2 } } */
+
+#include 
+volatile __m512h x1, x2, res, a, b;
+void extern
+avx512f_test (void)
+{
+  res = _mm512_add_ph (x1, _mm512_fmadd_pch (a, b, 
+_mm512_setzero_ph()));
+  res = _mm512_add_ph (x1, _mm512_fcmadd_pch (a, b, 
+_mm512_setzero_ph()));
+
+  res = _mm512_add_ph (x1, _mm512_fmul_pch (a, b));
+  res = _mm512_add_ph (x1, _mm512_fcmul_pch (a, b)); }
--
2.18.1



[PATCH] i386: Fix wrong optimization for consecutive masked scatters [PR 101472]

2021-08-26 Thread Kong, Lingling via Gcc-patches
Hi,

For avx512f_scattersi, the mask operand only affects the set source, so we
need to refine the pattern to let GCC know the mask register also affects the
destination. To do that, we put the mask operand into UNSPEC_VSIBADDR.

Bootstrapped and regression tested on x86_64-linux-gnu{-m32,-m64}.
Ok for master?

gcc/ChangeLog:

PR target/101472
* config/i386/sse.md (scattersi): Add mask operand to
UNSPEC_VSIBADDR.
(scatterdi): Likewise.
(*avx512f_scattersi): Merge mask operand to set_dest.
(*avx512f_scatterdi): Likewise.

gcc/testsuite/ChangeLog:

PR target/101472
* gcc.target/i386/avx512f-pr101472.c: New test.
* gcc.target/i386/avx512vl-pr101472.c: New test.
---
 gcc/config/i386/sse.md| 20 +++--
 .../gcc.target/i386/avx512f-pr101472.c| 49 
 .../gcc.target/i386/avx512vl-pr101472.c   | 79 +++
 3 files changed, 140 insertions(+), 8 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/avx512f-pr101472.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-pr101472.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
03fc2df1fb0..a3055dbd316 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -24205,8 +24205,9 @@
   "TARGET_AVX512F"
 {
   operands[5]
-= gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[0], operands[2],
-   operands[4]), UNSPEC_VSIBADDR);
+= gen_rtx_UNSPEC (Pmode, gen_rtvec (4, operands[0], operands[2],
+   operands[4], operands[1]), 
+   UNSPEC_VSIBADDR);
 })
 
 (define_insn "*avx512f_scattersi"
@@ -24214,10 +24215,11 @@
  [(unspec:P
 [(match_operand:P 0 "vsib_address_operand" "Tv")
  (match_operand: 2 "register_operand" "v")
- (match_operand:SI 4 "const1248_operand" "n")]
+ (match_operand:SI 4 "const1248_operand" "n")
+ (match_operand: 6 "register_operand" "1")]
 UNSPEC_VSIBADDR)])
(unspec:VI48F
- [(match_operand: 6 "register_operand" "1")
+ [(match_dup 6)
   (match_operand:VI48F 3 "register_operand" "v")]
  UNSPEC_SCATTER))
(clobber (match_scratch: 1 "="))] @@ -24243,8 +24245,9 
@@
   "TARGET_AVX512F"
 {
   operands[5]
-= gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[0], operands[2],
-   operands[4]), UNSPEC_VSIBADDR);
+= gen_rtx_UNSPEC (Pmode, gen_rtvec (4, operands[0], operands[2],
+   operands[4], operands[1]), 
+   UNSPEC_VSIBADDR);
 })
 
 (define_insn "*avx512f_scatterdi"
@@ -24252,10 +24255,11 @@
  [(unspec:P
 [(match_operand:P 0 "vsib_address_operand" "Tv")
  (match_operand: 2 "register_operand" "v")
- (match_operand:SI 4 "const1248_operand" "n")]
+ (match_operand:SI 4 "const1248_operand" "n")
+ (match_operand:QI 6 "register_operand" "1")]
 UNSPEC_VSIBADDR)])
(unspec:VI48F
- [(match_operand:QI 6 "register_operand" "1")
+ [(match_dup 6)
   (match_operand: 3 "register_operand" "v")]
  UNSPEC_SCATTER))
(clobber (match_scratch:QI 1 "="))] diff --git 
a/gcc/testsuite/gcc.target/i386/avx512f-pr101472.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr101472.c
new file mode 100644
index 000..89c6603c2ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr101472.c
@@ -0,0 +1,49 @@
+/* PR target/101472 */
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times "vpscatterqd\[ 
+\\t\]+\[^\{\n\]*ymm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpscatterdd\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpscatterqq\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpscatterdq\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*ymm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vscatterqps\[ 
+\\t\]+\[^\{\n\]*ymm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vscatterdps\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vscatterqpd\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vscatterdpd\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*ymm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+
+#include 
+
+void two_scatters_epi32(void* addr, __mmask8 k1, __mmask8 k2, __m512i vindex, 
+__m256i a, __m512i b)
+{
+ 

[PATCH] i386: Fix wrong optimization for consecutive masked scatters [PR 101472]

2021-08-25 Thread Kong, Lingling via Gcc-patches
Hi,

For avx512f_scattersi, the mask operand only affects the set source, so we
need to refine the pattern to let GCC know the mask register also affects the
destination. To do that, we put the mask operand into UNSPEC_VSIBADDR.

Bootstrapped and regression tested on x86_64-linux-gnu{-m32,-m64}.
Ok for master?

gcc/ChangeLog:

* config/i386/sse.md (scattersi): Add mask operand to
UNSPEC_VSIBADDR.
(scatterdi): Likewise.
(*avx512f_scattersi): Merge mask operand
to set_dest.
(*avx512f_scatterdi): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512f-pr101472.c: New test.
* gcc.target/i386/avx512vl-pr101472.c: Ditto.


0001-i386-Fix-wrong-optimization-for-consecutive-masked-s.patch
Description: 0001-i386-Fix-wrong-optimization-for-consecutive-masked-s.patch


[PATCH] i386: Fix _mm512_fpclass_ps_mask in O0 [PR 101471]

2021-08-25 Thread Kong, Lingling via Gcc-patches
Hi,

For _mm512_fpclass_ps_mask at -O0, the mask should be (__mmask16)-1 instead of
(__mmask8)-1.

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for master?

gcc/ChangeLog:

* config/i386/avx512dqintrin.h (_mm512_fpclass_ps_mask): Fix macro definition at -O0.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512f-pr101471.c: New test.


0001-i386-Fix-_mm512_fpclass_ps_mask-in-O0-PR-101471.patch
Description: 0001-i386-Fix-_mm512_fpclass_ps_mask-in-O0-PR-101471.patch