diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c
index 5770280faba035bf956c934b63b811e2b2aea299..c500c493166ca3c02100e33691161c8f883d14ce 100644
--- a/gcc/config/aarch64/aarch64-c.c
+++ b/gcc/config/aarch64/aarch64-c.c
@@ -141,6 +141,7 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
   aarch64_def_or_undef (TARGET_SHA3, "__ARM_FEATURE_SHA512", pfile);
   aarch64_def_or_undef (TARGET_SM4, "__ARM_FEATURE_SM3", pfile);
   aarch64_def_or_undef (TARGET_SM4, "__ARM_FEATURE_SM4", pfile);
+  aarch64_def_or_undef (TARGET_F16FML, "__ARM_FEATURE_FP16_FML", pfile);
 
   /* Not for ACLE, but required to keep "float.h" correct if we switch
      target between implementations that do or do not support ARMv8.2-A
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 195976c223aac20a030dbdde04bebc3d96f55ed2..5f7896797d3d67a2f9e041521fb5252a832966a7 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -35,6 +35,7 @@ VECTOR_MODES (INT, 16);       /* V16QI V8HI V4SI V2DI.  */
 VECTOR_MODES (FLOAT, 8);      /*                 V2SF.  */
 VECTOR_MODES (FLOAT, 16);     /*            V4SF V2DF.  */
 VECTOR_MODE (FLOAT, DF, 1);   /*                 V1DF.  */
+VECTOR_MODE (FLOAT, HF, 2);   /*                 V2HF.  */
 
 /* Oct Int: 256-bit integer mode needed for 32-byte vector arguments.  */
 INT_MODE (OI, 32);
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index a21c037da3f586b422110ced5f509c9b9e225aad..6bbde817178bfcd06f4df17dd8cc74f9e7f928fd 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -63,8 +63,8 @@ AARCH64_OPT_EXTENSION("crc", AARCH64_FL_CRC, 0, 0, "crc32")
 AARCH64_OPT_EXTENSION("lse", AARCH64_FL_LSE, 0, 0, "atomics")
 
 /* Enabling "fp16" also enables "fp".
-   Disabling "fp16" just disables "fp16".  */
-AARCH64_OPT_EXTENSION("fp16", AARCH64_FL_F16, AARCH64_FL_FP, 0, "fphp asimdhp")
+   Disabling "fp16" disables "fp16" and "fp16fml".  */
+AARCH64_OPT_EXTENSION("fp16", AARCH64_FL_F16, AARCH64_FL_FP, AARCH64_FL_F16FML, "fphp asimdhp")
 
 /* Enabling or disabling "rcpc" only changes "rcpc".  */
 AARCH64_OPT_EXTENSION("rcpc", AARCH64_FL_RCPC, 0, 0, "lrcpc")
@@ -93,4 +93,8 @@ AARCH64_OPT_EXTENSION("sha3", AARCH64_FL_SHA3, AARCH64_FL_SIMD | AARCH64_FL_SHA2
    Disabling "sm4" just disables "sm4".  */
 AARCH64_OPT_EXTENSION("sm4", AARCH64_FL_SM4, AARCH64_FL_SIMD, 0, "sm3 sm4")
 
+/* Enabling "fp16fml" also enables "fp" and "fp16".
+   Disabling "fp16fml" just disables "fp16fml".  */
+AARCH64_OPT_EXTENSION("fp16fml", AARCH64_FL_F16FML, AARCH64_FL_FP | AARCH64_FL_F16, 0, "asimdfml")
+
 #undef AARCH64_OPT_EXTENSION
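
The dependency rules above mean that enabling "fp16fml" pulls in "fp" and "fp16", while disabling "fp16" also switches "fp16fml" off.  As a usage sketch (not part of the patch), the extension can also be requested per function through the aarch64 target attribute; the attribute string below assumes the usual arch/feature-modifier syntax accepted by that attribute.

#include <arm_neon.h>

/* Assumed syntax: per-function selection of the new +fp16fml modifier.  */
__attribute__ ((target ("arch=armv8.2-a+fp16fml")))
float32x2_t
widening_mla (float32x2_t acc, float16x4_t a, float16x4_t b)
{
  /* Uses one of the intrinsics added to arm_neon.h later in this patch.  */
  return vfmlal_low_u32 (acc, a, b);
}
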
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 8329f8dd8aa45d4d8600718fd2221e55dcac79af..1a05f46b3231ffa867cc840df2a79328c85f48a7 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -603,3 +603,38 @@
   VAR1 (TERNOPUI, xarq, 0, v2di)
   /* Implemented by aarch64_bcaxqv8hi.  */
   VAR1 (TERNOPU, bcaxq, 0, v8hi)
+
+  /* Implemented by aarch64_fml<f16mac1>l<f16quad>_low<mode>.  */
+  VAR1 (TERNOP, fmlal_low, 0, v2sf)
+  VAR1 (TERNOP, fmlsl_low, 0, v2sf)
+  VAR1 (TERNOP, fmlalq_low, 0, v4sf)
+  VAR1 (TERNOP, fmlslq_low, 0, v4sf)
+  /* Implemented by aarch64_fml<f16mac1>l<f16quad>_high<mode>.  */
+  VAR1 (TERNOP, fmlal_high, 0, v2sf)
+  VAR1 (TERNOP, fmlsl_high, 0, v2sf)
+  VAR1 (TERNOP, fmlalq_high, 0, v4sf)
+  VAR1 (TERNOP, fmlslq_high, 0, v4sf)
+  /* Implemented by aarch64_fml<f16mac1>l_lane_lowv2sf.  */
+  VAR1 (QUADOP_LANE, fmlal_lane_low, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_lane_low, 0, v2sf)
+  /* Implemented by aarch64_fml<f16mac1>l_laneq_lowv2sf.  */
+  VAR1 (QUADOP_LANE, fmlal_laneq_low, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_laneq_low, 0, v2sf)
+  /* Implemented by aarch64_fml<f16mac1>lq_lane_lowv4sf.  */
+  VAR1 (QUADOP_LANE, fmlalq_lane_low, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_lane_low, 0, v4sf)
+  /* Implemented by aarch64_fml<f16mac1>lq_laneq_lowv4sf.  */
+  VAR1 (QUADOP_LANE, fmlalq_laneq_low, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_laneq_low, 0, v4sf)
+  /* Implemented by aarch64_fml<f16mac1>l_lane_highv2sf.  */
+  VAR1 (QUADOP_LANE, fmlal_lane_high, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_lane_high, 0, v2sf)
+  /* Implemented by aarch64_fml<f16mac1>l_laneq_highv2sf.  */
+  VAR1 (QUADOP_LANE, fmlal_laneq_high, 0, v2sf)
+  VAR1 (QUADOP_LANE, fmlsl_laneq_high, 0, v2sf)
+  /* Implemented by aarch64_fml<f16mac1>lq_lane_highv4sf.  */
+  VAR1 (QUADOP_LANE, fmlalq_lane_high, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_lane_high, 0, v4sf)
+  /* Implemented by aarch64_fml<f16mac1>lq_laneq_highv4sf.  */
+  VAR1 (QUADOP_LANE, fmlalq_laneq_high, 0, v4sf)
+  VAR1 (QUADOP_LANE, fmlslq_laneq_high, 0, v4sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6bc0b06748bea777a780f9ee1271594da271c39d..bdf7c58726d0c7bcdb6dd35e1607ea0216004087 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -6006,6 +6006,596 @@
   [(set_attr "type" "crypto_sm4")]
 )
 
+;; fp16fml
+
+(define_expand "aarch64_fml<f16mac1>l<f16quad>_low<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	 [(match_operand:VDQSF 1 "register_operand" "0")
+	  (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	  (match_operand:<VFMLA_W> 3 "register_operand" "w")]
+	 VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+  int nunits = GET_MODE_NUNITS (<VFMLA_W>mode);
+  rtx p1 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, nunits, false);
+  rtx p2 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, nunits, false);
+
+  emit_insn (gen_aarch64_simd_fml<f16mac1>l<f16quad>_low<mode> (operands[0],
+								operands[1],
+								operands[2],
+								operands[3],
+								p1, p2));
+  DONE;
+
+})
+
+(define_expand "aarch64_fml<f16mac1>l<f16quad>_high<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	 [(match_operand:VDQSF 1 "register_operand" "0")
+	  (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	  (match_operand:<VFMLA_W> 3 "register_operand" "w")]
+	 VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+  int nunits = GET_MODE_NUNITS (<VFMLA_W>mode);
+  rtx p1 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, nunits, true);
+  rtx p2 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, nunits, true);
+
+  emit_insn (gen_aarch64_simd_fml<f16mac1>l<f16quad>_high<mode> (operands[0],
+								 operands[1],
+								 operands[2],
+								 operands[3],
+								 p1, p2));
+  DONE;
+})
+
+(define_insn "aarch64_simd_fmlal<f16quad>_low<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(fma:VDQSF
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 4 "vect_par_cnst_lo_half" "")))
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 3 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 5 "vect_par_cnst_lo_half" "")))
+	 (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl<f16quad>_low<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(fma:VDQSF
+	 (float_extend:VDQSF
+	  (neg:<VFMLA_SEL_W>
+	   (vec_select:<VFMLA_SEL_W>
+	    (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	    (match_operand:<VFMLA_W> 4 "vect_par_cnst_lo_half" ""))))
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 3 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 5 "vect_par_cnst_lo_half" "")))
+	 (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlal<f16quad>_high<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(fma:VDQSF
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 4 "vect_par_cnst_hi_half" "")))
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 3 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 5 "vect_par_cnst_hi_half" "")))
+	 (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl<f16quad>_high<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(fma:VDQSF
+	 (float_extend:VDQSF
+	  (neg:<VFMLA_SEL_W>
+	   (vec_select:<VFMLA_SEL_W>
+	    (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	    (match_operand:<VFMLA_W> 4 "vect_par_cnst_hi_half" ""))))
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 3 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 5 "vect_par_cnst_hi_half" "")))
+	 (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_expand "aarch64_fml<f16mac1>l_lane_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "")
+			   (match_operand:V4HF 2 "register_operand" "")
+			   (match_operand:V4HF 3 "register_operand" "")
+			   (match_operand:SI 4 "aarch64_imm2" "")]
+	 VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode,
+					      GET_MODE_NUNITS (V4HFmode),
+					      false);
+    rtx lane = GEN_INT (ENDIAN_LANE_N (GET_MODE_NUNITS (SImode), INTVAL (operands[4])));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_lane_lowv2sf (operands[0],
+							    operands[1],
+							    operands[2],
+							    operands[3],
+							    p1, lane));
+    DONE;
+}
+)
+
+(define_expand "aarch64_fml<f16mac1>l_lane_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "")
+			   (match_operand:V4HF 2 "register_operand" "")
+			   (match_operand:V4HF 3 "register_operand" "")
+			   (match_operand:SI 4 "aarch64_imm2" "")]
+	 VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode,
+					      GET_MODE_NUNITS (V4HFmode),
+					      true);
+    rtx lane = GEN_INT (ENDIAN_LANE_N (GET_MODE_NUNITS (SImode), INTVAL (operands[4])));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_lane_highv2sf (operands[0],
+							     operands[1],
+							     operands[2],
+							     operands[3],
+							     p1, lane));
+    DONE;
+})
+
+(define_insn "aarch64_simd_fmlal_lane_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_lo_half" "")))
+	 (float_extend:V2SF
+	   (vec_duplicate:V2HF
+	    (vec_select:HF
+	     (match_operand:V4HF 3 "register_operand" "x")
+	     (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_lane_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	  (neg:V2HF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_lo_half" ""))))
+	 (float_extend:V2SF
+	  (vec_duplicate:V2HF
+	   (vec_select:HF
+	    (match_operand:V4HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlal_lane_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_hi_half" "")))
+	 (float_extend:V2SF
+	   (vec_duplicate:V2HF
+	    (vec_select:HF
+	     (match_operand:V4HF 3 "register_operand" "x")
+	     (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_lane_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	   (neg:V2HF
+	    (vec_select:V2HF
+	     (match_operand:V4HF 2 "register_operand" "w")
+	     (match_operand:V4HF 4 "vect_par_cnst_hi_half" ""))))
+	 (float_extend:V2SF
+	   (vec_duplicate:V2HF
+	    (vec_select:HF
+	     (match_operand:V4HF 3 "register_operand" "x")
+	     (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_expand "aarch64_fml<f16mac1>lq_laneq_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "")
+	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "")
+			   (match_operand:V8HF 2 "register_operand" "")
+			   (match_operand:V8HF 3 "register_operand" "")
+			   (match_operand:SI 4 "aarch64_lane_imm3" "")]
+	 VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode,
+					      GET_MODE_NUNITS (V8HFmode),
+					      false);
+    rtx lane = GEN_INT (ENDIAN_LANE_N (GET_MODE_NUNITS (SImode), INTVAL (operands[4])));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_laneq_lowv4sf (operands[0],
+							      operands[1],
+							      operands[2],
+							      operands[3],
+							      p1, lane));
+    DONE;
+})
+
+(define_expand "aarch64_fml<f16mac1>lq_laneq_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "")
+	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "")
+			   (match_operand:V8HF 2 "register_operand" "")
+			   (match_operand:V8HF 3 "register_operand" "")
+			   (match_operand:SI 4 "aarch64_lane_imm3" "")]
+	 VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode,
+					      GET_MODE_NUNITS (V8HFmode),
+					      true);
+    rtx lane = GEN_INT (ENDIAN_LANE_N (GET_MODE_NUNITS (SImode), INTVAL (operands[4])));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_laneq_highv4sf (operands[0],
+							       operands[1],
+							       operands[2],
+							       operands[3],
+							       p1, lane));
+    DONE;
+})
+
+(define_insn "aarch64_simd_fmlalq_laneq_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (vec_select:V4HF
+	    (match_operand:V8HF 2 "register_operand" "w")
+	    (match_operand:V8HF 4 "vect_par_cnst_lo_half" "")))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_laneq_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	  (float_extend:V4SF
+	   (neg:V4HF
+	    (vec_select:V4HF
+	     (match_operand:V8HF 2 "register_operand" "w")
+	     (match_operand:V8HF 4 "vect_par_cnst_lo_half" ""))))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlalq_laneq_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (vec_select:V4HF
+	    (match_operand:V8HF 2 "register_operand" "w")
+	    (match_operand:V8HF 4 "vect_par_cnst_hi_half" "")))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_laneq_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (neg:V4HF
+	   (vec_select:V4HF
+	    (match_operand:V8HF 2 "register_operand" "w")
+	    (match_operand:V8HF 4 "vect_par_cnst_hi_half" ""))))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_expand "aarch64_fml<f16mac1>l_laneq_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "")
+		      (match_operand:V4HF 2 "register_operand" "")
+		      (match_operand:V8HF 3 "register_operand" "")
+		      (match_operand:SI 4 "aarch64_lane_imm3" "")]
+	 VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode,
+					      GET_MODE_NUNITS (V4HFmode),
+					      false);
+    rtx lane = GEN_INT (ENDIAN_LANE_N (GET_MODE_NUNITS (SImode), INTVAL (operands[4])));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_laneq_lowv2sf (operands[0],
+							     operands[1],
+							     operands[2],
+							     operands[3],
+							     p1, lane));
+    DONE;
+
+})
+
+(define_expand "aarch64_fml<f16mac1>l_laneq_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "")
+		      (match_operand:V4HF 2 "register_operand" "")
+		      (match_operand:V8HF 3 "register_operand" "")
+		      (match_operand:SI 4 "aarch64_lane_imm3" "")]
+	 VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode,
+					      GET_MODE_NUNITS (V4HFmode),
+					      true);
+    rtx lane = GEN_INT (ENDIAN_LANE_N (GET_MODE_NUNITS (SImode), INTVAL (operands[4])));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_laneq_highv2sf (operands[0],
+							      operands[1],
+							      operands[2],
+							      operands[3],
+							      p1, lane));
+    DONE;
+
+})
+
+(define_insn "aarch64_simd_fmlal_laneq_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_lo_half" "")))
+	 (float_extend:V2SF
+	  (vec_duplicate:V2HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_laneq_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	  (neg:V2HF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_lo_half" ""))))
+	 (float_extend:V2SF
+	  (vec_duplicate:V2HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlal_laneq_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_hi_half" "")))
+	 (float_extend:V2SF
+	  (vec_duplicate:V2HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_laneq_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	  (neg:V2HF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_hi_half" ""))))
+	 (float_extend:V2SF
+	  (vec_duplicate:V2HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_expand "aarch64_fml<f16mac1>lq_lane_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "")
+	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "")
+		      (match_operand:V8HF 2 "register_operand" "")
+		      (match_operand:V4HF 3 "register_operand" "")
+		      (match_operand:SI 4 "aarch64_imm2" "")]
+	 VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode,
+					      GET_MODE_NUNITS (V8HFmode),
+					      false);
+    rtx lane = GEN_INT (ENDIAN_LANE_N (GET_MODE_NUNITS (SImode), INTVAL (operands[4])));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_lane_lowv4sf (operands[0],
+							     operands[1],
+							     operands[2],
+							     operands[3],
+							     p1, lane));
+    DONE;
+})
+
+(define_expand "aarch64_fml<f16mac1>lq_lane_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "")
+	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "")
+		      (match_operand:V8HF 2 "register_operand" "")
+		      (match_operand:V4HF 3 "register_operand" "")
+		      (match_operand:SI 4 "aarch64_imm2" "")]
+	 VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode,
+					      GET_MODE_NUNITS (V8HFmode),
+					      true);
+    rtx lane = GEN_INT (ENDIAN_LANE_N (GET_MODE_NUNITS (SImode), INTVAL (operands[4])));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_lane_highv4sf (operands[0],
+							      operands[1],
+							      operands[2],
+							      operands[3],
+							      p1, lane));
+    DONE;
+})
+
+(define_insn "aarch64_simd_fmlalq_lane_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (vec_select:V4HF
+	   (match_operand:V8HF 2 "register_operand" "w")
+	   (match_operand:V8HF 4 "vect_par_cnst_lo_half" "")))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V4HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_lane_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (neg:V4HF
+	   (vec_select:V4HF
+	    (match_operand:V8HF 2 "register_operand" "w")
+	    (match_operand:V8HF 4 "vect_par_cnst_lo_half" ""))))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V4HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlalq_lane_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (vec_select:V4HF
+	   (match_operand:V8HF 2 "register_operand" "w")
+	   (match_operand:V8HF 4 "vect_par_cnst_hi_half" "")))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V4HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_lane_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (neg:V4HF
+	   (vec_select:V4HF
+	    (match_operand:V8HF 2 "register_operand" "w")
+	    (match_operand:V8HF 4 "vect_par_cnst_hi_half" ""))))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V4HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
 ;; pmull
 
 (define_insn "aarch64_crypto_pmulldi"
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index b37f1d58a857afb0ff6320acd234c39abadd0fb8..78fa04001bba129a90d577f36a1f239054e34b3f 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -155,6 +155,8 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_V8_4	      (1 << 15)  /* Has ARMv8.4-A features.  */
 #define AARCH64_FL_SM4	      (1 << 16)  /* Has ARMv8.4-A SM3 and SM4.  */
 #define AARCH64_FL_SHA3	      (1 << 17)  /* Has ARMv8.4-a SHA3 and SHA512.  */
+#define AARCH64_FL_F16FML     (1 << 18)  /* Has ARMv8.4-a FP16 extensions.  */
+
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
 
@@ -171,7 +173,7 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_FOR_ARCH8_3			\
   (AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_V8_3)
 #define AARCH64_FL_FOR_ARCH8_4			\
-  (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4)
+  (AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_V8_4 | AARCH64_FL_F16FML)
 
 /* Macros to test ISA flags.  */
 
@@ -190,6 +192,7 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_V8_4	   (aarch64_isa_flags & AARCH64_FL_V8_4)
 #define AARCH64_ISA_SM4	           (aarch64_isa_flags & AARCH64_FL_SM4)
 #define AARCH64_ISA_SHA3	   (aarch64_isa_flags & AARCH64_FL_SHA3)
+#define AARCH64_ISA_F16FML	   (aarch64_isa_flags & AARCH64_FL_F16FML)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
@@ -206,6 +209,9 @@ extern unsigned aarch64_architecture_version;
 /* SM is an optional extension to AdvSIMD.  */
 #define TARGET_SM4 (TARGET_SIMD && AARCH64_ISA_SM4)
 
+/* FP16FML is an optional extension to AdvSIMD.  */
+#define TARGET_F16FML (TARGET_SIMD && AARCH64_ISA_F16FML && TARGET_FP_F16INST)
+
 /* CRC instructions that can be enabled through +crc arch extension.  */
 #define TARGET_CRC32 (AARCH64_ISA_CRC)
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index b48299af9da51b4da4456ac58b0e44cd5ba68504..91cb9f618b7931faa4b25aa8e64d93730394762d 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -31761,6 +31761,195 @@ vbcaxq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
 }
 #pragma GCC pop_options
 
+#pragma GCC push_options
+#pragma GCC target(("arch=armv8.2-a+fp16fml"))
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_aarch64_fmlal_lowv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_aarch64_fmlsl_lowv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_aarch64_fmlalq_lowv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_aarch64_fmlslq_lowv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_aarch64_fmlal_highv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_aarch64_fmlsl_highv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_aarch64_fmlalq_highv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_aarch64_fmlslq_highv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		     const int __lane)
+{
+  return __builtin_aarch64_fmlal_lane_lowv2sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		     const int __lane)
+{
+  return __builtin_aarch64_fmlsl_lane_lowv2sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		      const int __lane)
+{
+  return __builtin_aarch64_fmlal_laneq_lowv2sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		      const int __lane)
+{
+  return __builtin_aarch64_fmlsl_laneq_lowv2sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		      const int __lane)
+{
+  return __builtin_aarch64_fmlalq_lane_lowv4sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		      const int __lane)
+{
+  return __builtin_aarch64_fmlslq_lane_lowv4sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+		       const int __lane)
+{
+  return __builtin_aarch64_fmlalq_laneq_lowv4sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+		      const int __lane)
+{
+  return __builtin_aarch64_fmlslq_laneq_lowv4sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		     const int __lane)
+{
+  return __builtin_aarch64_fmlal_lane_highv2sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		     const int __lane)
+{
+  return __builtin_aarch64_fmlsl_lane_highv2sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		      const int __lane)
+{
+  return __builtin_aarch64_fmlal_laneq_highv2sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		      const int __lane)
+{
+  return __builtin_aarch64_fmlsl_laneq_highv2sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		      const int __lane)
+{
+  return __builtin_aarch64_fmlalq_lane_highv4sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		      const int __lane)
+{
+  return __builtin_aarch64_fmlslq_lane_highv4sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+		       const int __lane)
+{
+  return __builtin_aarch64_fmlalq_laneq_highv4sf (__r, __a, __b, __lane);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+		      const int __lane)
+{
+  return __builtin_aarch64_fmlslq_laneq_highv4sf (__r, __a, __b, __lane);
+}
+
+#pragma GCC pop_options
+
 #undef __aarch64_vget_lane_any
 
 #undef __aarch64_vdup_lane_any
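
A brief usage sketch for the intrinsics added above (using the names as spelled in this version of the patch); it assumes the file is compiled with -march=armv8.2-a+fp16fml or an equivalent target attribute.

#include <arm_neon.h>

/* Accumulate the widened products of both halves of two fp16 vectors,
   mapping to FMLAL (low half) and FMLAL2 (high half).  */
float32x4_t
mla_fp16_pairs (float32x4_t acc, float16x8_t a, float16x8_t b)
{
  acc = vfmlalq_low_u32 (acc, a, b);
  acc = vfmlalq_high_u32 (acc, a, b);
  return acc;
}

/* Subtract from acc the low half of a scaled by lane 6 of coeffs,
   mapping to the FMLSL by-element form.  */
float32x2_t
mls_by_lane (float32x2_t acc, float16x4_t a, float16x8_t coeffs)
{
  return vfmlsl_laneq_low_u32 (acc, a, coeffs, 6);
}
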
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index a44f4264ab7104b041c53c89515a88f7c876f703..b9d5ba1178cf8c78cc7bf8f6e7b9bec2a3807a71 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -151,6 +151,12 @@
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) ival <= 4")))
 
+(define_constraint "Ui7"
+  "@internal
+  A constraint that matches the integers 0...7."
+  (and (match_code "const_int")
+       (match_test "(unsigned HOST_WIDE_INT) ival <= 7")))
+
 (define_constraint "Up3"
   "@internal
   A constraint that matches the integers 2^(0...4)."
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 92124dacf953e05a18b191a81e1d4490409dca27..fcbdd609da32626cad00192d38197fa41348477b 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -374,6 +374,10 @@
     UNSPEC_SHA512H2     ; Used in aarch64-simd.md.
     UNSPEC_SHA512SU0    ; Used in aarch64-simd.md.
     UNSPEC_SHA512SU1    ; Used in aarch64-simd.md.
+    UNSPEC_FMLAL	; Used in aarch64-simd.md.
+    UNSPEC_FMLSL	; Used in aarch64-simd.md.
+    UNSPEC_FMLAL2	; Used in aarch64-simd.md.
+    UNSPEC_FMLSL2	; Used in aarch64-simd.md.
 ])
 
 ;; ------------------------------------------------------------------
@@ -849,6 +853,15 @@
 ;; No need of iterator for -fPIC as it use got_lo12 for both modes.
 (define_mode_attr got_modifier [(SI "gotpage_lo14") (DI "gotpage_lo15")])
 
+;; Mode of the 2nd and 3rd arguments to the fp16fml widening multiply add/sub.
+(define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")])
+
+(define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")])
+
+(define_mode_attr f16quad [(V2SF "") (V4SF "q")])
+
+(define_code_attr f16mac [(plus "a") (minus "s")])
+
 ;; -------------------------------------------------------------------
 ;; Code Iterators
 ;; -------------------------------------------------------------------
@@ -1153,6 +1166,12 @@
 
 (define_int_iterator CRYPTO_SM3PART [UNSPEC_SM3PARTW1 UNSPEC_SM3PARTW2])
 
+;; Iterators for fp16 operations
+
+(define_int_iterator VFMLA16_LOW [UNSPEC_FMLAL UNSPEC_FMLSL])
+
+(define_int_iterator VFMLA16_HIGH [UNSPEC_FMLAL2 UNSPEC_FMLSL2])
+
 ;; Iterators for atomic operations.
 
 (define_int_iterator ATOMIC_LDOP
@@ -1332,3 +1351,6 @@
 			   (UNSPEC_SM3TT2A "2a") (UNSPEC_SM3TT2B "2b")])
 
 (define_int_attr sm3part_op [(UNSPEC_SM3PARTW1 "1") (UNSPEC_SM3PARTW2 "2")])
+
+(define_int_attr f16mac1 [(UNSPEC_FMLAL "a") (UNSPEC_FMLSL "s")
+			  (UNSPEC_FMLAL2 "a") (UNSPEC_FMLSL2 "s")])
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index e0bd18d91e2cf187cb3a2fd7650112ae027d0b50..750d30e2e40e5621b8a925c8d96425d720f34c52 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -166,6 +166,12 @@
   (and (match_code "const_int")
        (match_test "UINTVAL (op) <= 3")))
 
+;; The imm3 field is a 3-bit field that only accepts immediates in the
+;; range 0..7.
+(define_predicate "aarch64_lane_imm3"
+  (and (match_code "const_int")
+       (match_test "UINTVAL (op) <= 7")))
+
 ;; An immediate that fits into 24 bits.
 (define_predicate "aarch64_imm24"
   (and (match_code "const_int")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 45f00592d754743e62951969713b3797776b2abd..4c7429df395319fa2f940c6d4b3228182541a106 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14514,6 +14514,12 @@ Enable Round Double Multiply Accumulate instructions.  This is on by default
 for @option{-march=armv8.1-a}.
 @item fp16
 Enable FP16 extension.  This also enables floating-point instructions.
+@item fp16fml
+Enable FP16 fmla extension.  This also enables FP16 extensions and
+floating-point instructions.  This option is enabled by default for
+@option{-march=armv8.4-a}.  Use of this option with architectures prior to
+Armv8.2-A is not supported.
+
 @item rcpc
 Enable the RcPc extension.  This does not change code generation from GCC,
 but is passed on to the assembler, enabling inline asm statements to use
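
As a sketch of how the documented option is meant to be consumed, code can key off the __ARM_FEATURE_FP16_FML predefine added in aarch64-c.c above; the fallback path below is illustrative only.

#include <arm_neon.h>

/* Widening multiply-accumulate of the low halves: uses the new intrinsic when
   the translation unit is built with +fp16fml, otherwise widens explicitly.  */
float32x2_t
mla_low_halves (float32x2_t acc, float16x4_t a, float16x4_t b)
{
#ifdef __ARM_FEATURE_FP16_FML
  return vfmlal_low_u32 (acc, a, b);
#else
  float32x2_t lo_a = vget_low_f32 (vcvt_f32_f16 (a));
  float32x2_t lo_b = vget_low_f32 (vcvt_f32_f16 (b));
  return vfma_f32 (acc, lo_a, lo_b);
#endif
}
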
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high.h b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c86bd19153cc0888f7b28f36d141b9fe08f535e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high.h
@@ -0,0 +1,25 @@
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_high_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlalq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_high_u32 (r, a, b);
+}
+
+float32x2_t
+test_vfmlsl_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_high_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlslq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_high_u32 (r, a, b);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high_1.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..26cf219f01d1afde846a904520ba0581275ef0af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high_1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.2-a+fp16fml" } */
+
+#include "fp16_fmul_high.h"
+
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high_2.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a82938d13c0bdebaa9d4084e92246f1722b7df88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high_2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.3-a+fp16fml" } */
+
+#include "fp16_fmul_high.h"
+
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high_3.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..be1d35a42167e3a0476403101ab6554038f57380
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_high_3.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a+fp16" } */
+
+#include "fp16_fmul_high.h"
+
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high.h b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high.h
new file mode 100644
index 0000000000000000000000000000000000000000..1039347865e0bc79dfe351fd52f36964e7c41188
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high.h
@@ -0,0 +1,49 @@
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_lane_high_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlsl_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_lane_high_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlal_laneq_high_u32 (r, a, b, 6);
+}
+
+float32x2_t
+test_vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlsl_laneq_high_u32 (r, a, b, 6);
+}
+
+float32x4_t
+test_vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlalq_lane_high_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlslq_lane_high_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlalq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_laneq_high_u32 (r, a, b, 7);
+}
+
+float32x4_t
+test_vfmlslq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_laneq_high_u32 (r, a, b, 7);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high_1.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..c1534edd73d0d9c765b70be2569d891c66d5a667
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high_1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.2-a+fp16fml" } */
+
+#include "fp16_fmul_lane_high.h"
+
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */ 
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */ 
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high_2.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..929a46c4752c18cbecd41e5579cad6574a4501cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high_2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.3-a+fp16fml" } */
+
+#include "fp16_fmul_lane_high.h"
+
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */ 
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */ 
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high_3.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..65a35e97de5273747910c2f8b95d06ace12185d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_high_3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a+fp16" } */
+
+#include "fp16_fmul_lane_high.h"
+
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl2\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low.h b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low.h
new file mode 100644
index 0000000000000000000000000000000000000000..b689741bdb006e89f14f29b803ba6d38a62b387e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low.h
@@ -0,0 +1,49 @@
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_lane_low_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlsl_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_lane_low_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlal_laneq_low_u32 (r, a, b, 6);
+}
+
+float32x2_t
+test_vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlsl_laneq_low_u32 (r, a, b, 6);
+}
+
+float32x4_t
+test_vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlalq_lane_low_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlslq_lane_low_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlalq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_laneq_low_u32 (r, a, b, 7);
+}
+
+float32x4_t
+test_vfmlslq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_laneq_low_u32 (r, a, b, 7);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low_1.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..1b56845bd320b1f34e6d3b76a848a8aea1cdf17c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low_1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.2-a+fp16fml" } */
+
+#include "fp16_fmul_lane_low.h"
+
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low_2.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..d42aef2e149f42de7a16161857a75e1c9633e906
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low_2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.3-a+fp16fml" } */
+
+#include "fp16_fmul_lane_low.h"
+
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low_3.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a06c42373709d845de6dad14554b6aefbd110b03
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_lane_low_3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a+fp16" } */
+
+#include "fp16_fmul_lane_low.h"
+
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[0\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.h\\\[6\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.h\\\[7\\\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low.h b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low.h
new file mode 100644
index 0000000000000000000000000000000000000000..778ca1c245c7343b38272e586a54927c7cd50bee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low.h
@@ -0,0 +1,25 @@
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_low_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlalq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_low_u32 (r, a, b);
+}
+
+float32x2_t
+test_vfmlsl_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_low_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlslq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_low_u32 (r, a, b);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low_1.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..90d611d6988cf3595f4da09ec2820abeb1364900
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low_1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.2-a+fp16fml" } */
+
+#include "fp16_fmul_low.h"
+
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low_2.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..b83f4a1260617482af5197d3434bb5c39cd128b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low_2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.3-a+fp16fml" } */
+
+#include "fp16_fmul_low.h"
+
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low_3.c b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..6b12d59b0e2ef3a5e182a8fcdc5d475617bf8a57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fp16_fmul_low_3.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.4-a+fp16" } */
+
+#include "fp16_fmul_low.h"
+
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlal\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.2s, v\[0-9\]+\.2h, v\[0-9\]+\.2h" 1 } } */
+/* { dg-final { scan-assembler-times "fmlsl\\tv\[0-9\]+\.4s, v\[0-9\]+\.4h, v\[0-9\]+\.4h" 1 } } */
