This pass is used to optimise assignments to the FPMR register in
aarch64.  I chose to implement this as a middle-end pass because it
mostly reuses the existing RTL PRE code within gcse.cc.

Compared to RTL PRE, the key difference in this new pass is that we
insert new writes directly to the destination hardreg, instead of
writing to a new pseudo-register and copying the result later.  This
requires changes to the analysis portion of the pass, because sets
cannot be moved before existing instructions that set, use or clobber
the hardreg, and the value becomes unavailable after any uses or
clobbers of the hardreg.

Any uses of the hardreg in debug insns will be deleted.  We could do
better than this, but for the aarch64 fpmr I don't think we emit useful
debuginfo for deleted fp8 instructions anyway (and I don't even know if
it's possible to have a debug fpmr use when entering hardreg PRE).


Compared to the first version, I've now fixed the broken debug uses, and
simplified a lot of the analysis (it turns out DF analysis already provides
cleaner versions of the checks I need).  I also fixed a couple of other minor
bugs (including one that broke the build on every target except aarch64).

The new tests pass; I haven't rerun a bootstrap or full regression test yet,
but this should be NFC except for aarch64 code that uses the fpmr register.

Is this ok for master?

gcc/ChangeLog:

        * config/aarch64/aarch64.h (HARDREG_PRE_REGNOS): New macro.
        * gcse.cc (doing_hardreg_pre_p): New global variable.
        (do_load_motion): New function.
        (current_hardreg_regno): New global variable.
        (compute_local_properties): Unset transp for hardreg clobbers.
        (prune_hardreg_uses): New function.
        (want_to_gcse_p): Use different checks for hardreg PRE.
        (oprs_unchanged_p): Disable load motion for hardreg PRE pass.
        (hash_scan_set): For hardreg PRE, skip non-hardreg sets and
        check for hardreg clobbers.
        (record_last_mem_set_info): Skip for hardreg PRE.
        (compute_pre_data): Prune hardreg uses from transp bitmap.
        (pre_expr_reaches_here_p_work): Add sentence to comment.
        (insert_insn_start_basic_block): New functions.
        (pre_edge_insert): Don't add hardreg sets to predecessor block.
        (pre_delete): Use hardreg for the reaching reg.
        (reset_hardreg_debug_uses): New function.
        (pre_gcse): For hardreg PRE, reset debug uses and don't insert
        copies.
        (one_pre_gcse_pass): Disable load motion for hardreg PRE.
        (execute_hardreg_pre): New.
        (class pass_hardreg_pre): New.
        (pass_hardreg_pre::gate): New.
        (make_pass_hardreg_pre): New.
        * passes.def (pass_hardreg_pre): New pass.
        * tree-pass.h (make_pass_hardreg_pre): New.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/acle/fpmr-1.c: New test.
        * gcc.target/aarch64/acle/fpmr-2.c: New test.
        * gcc.target/aarch64/acle/fpmr-3.c: New test.
        * gcc.target/aarch64/acle/fpmr-4.c: New test.


diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 
f1251f67c74e8da8420bad2d07a11a98a7de37ff..61837a4a98744225b9d15cfbc37cc914ac48421b
 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1652,6 +1652,10 @@ enum class aarch64_tristate_mode : int { NO, YES, MAYBE 
};
   { int (aarch64_tristate_mode::MAYBE), \
     int (aarch64_local_sme_state::ANY) }
 
+/* Zero terminated list of regnos for which hardreg PRE should be
+   applied.  */
+#define HARDREG_PRE_REGNOS { FPM_REGNUM, 0 }
+
 #endif
 
 #endif /* GCC_AARCH64_H */
diff --git a/gcc/gcse.cc b/gcc/gcse.cc
index 
31b92f30fa1ba6c519429d4b7bc55547b2d71c01..f33de3747b896950568154acbfac1817519fe748
 100644
--- a/gcc/gcse.cc
+++ b/gcc/gcse.cc
@@ -415,6 +415,17 @@ static int gcse_create_count;
 
 /* Doing code hoisting.  */
 static bool doing_code_hoisting_p = false;
+
+/* Doing hardreg_pre.  */
+static bool doing_hardreg_pre_p = false;
+
+inline bool
+do_load_motion ()
+{
+  return flag_gcse_lm && !doing_hardreg_pre_p;
+}
+
+static unsigned int current_hardreg_regno;
 
 /* For available exprs */
 static sbitmap *ae_kill;
@@ -689,14 +700,32 @@ compute_local_properties (sbitmap *transp, sbitmap *comp, 
sbitmap *antloc,
          int indx = expr->bitmap_index;
          struct gcse_occr *occr;
 
-         /* The expression is transparent in this block if it is not killed.
-            We start by assuming all are transparent [none are killed], and
-            then reset the bits for those that are.  */
+         /* In most cases, the expression is transparent in the block if it is
+            not killed.  The exception to this is during hardreg PRE, in which
+            uses of the hardreg prevent transparency but do not kill the
+            expression.
+
+            We start by assuming all expressions are transparent [none are
+            killed], and then reset the bits for those that are.  */
          if (transp)
-           compute_transp (expr->expr, indx, transp,
-                           blocks_with_calls,
-                           modify_mem_list_set,
-                           canon_modify_mem_list);
+           {
+             compute_transp (expr->expr, indx, transp,
+                             blocks_with_calls,
+                             modify_mem_list_set,
+                             canon_modify_mem_list);
+
+             if (doing_hardreg_pre_p)
+               {
+                 /* We also need to check whether the destination hardreg is
+                    set or call-clobbered in each BB.  We'll check for hardreg
+                    uses later.  */
+                 df_ref def;
+                 for (def = DF_REG_DEF_CHAIN (current_hardreg_regno);
+                      def;
+                      def = DF_REF_NEXT_REG (def))
+                   bitmap_clear_bit (transp[DF_REF_BB (def)->index], indx);
+               }
+           }
 
          /* The occurrences recorded in antic_occr are exactly those that
             we want to set to nonzero in ANTLOC.  */
@@ -728,6 +757,37 @@ compute_local_properties (sbitmap *transp, sbitmap *comp, 
sbitmap *antloc,
        }
     }
 }
+
+/* A hardreg set is not transparent in a block if there are any uses of that
+   hardreg.  This filters the results of compute_local_properties, after the
+   result of that function has been used to define the kills bitmap.
+
+   TRANSP is the destination sbitmap to be updated.
+
+   TABLE controls which hash table to look at.  */
+
+static void
+prune_hardreg_uses (sbitmap *transp, struct gcse_hash_table_d *table)
+{
+  unsigned int i;
+  gcc_assert (doing_hardreg_pre_p);
+
+  for (i = 0; i < table->size; i++)
+    {
+      struct gcse_expr *expr;
+
+      for (expr = table->table[i]; expr != NULL; expr = expr->next_same_hash)
+       {
+         int indx = expr->bitmap_index;
+         df_ref def;
+
+         for (def = DF_REG_USE_CHAIN (current_hardreg_regno);
+              def;
+              def = DF_REF_NEXT_REG (def))
+           bitmap_clear_bit (transp[DF_REF_BB (def)->index], indx);
+       }
+    }
+}
 
 /* Hash table support.  */
 
@@ -771,17 +831,24 @@ want_to_gcse_p (rtx x, machine_mode mode, HOST_WIDE_INT 
*max_distance_ptr)
      pressure, i.e., a pseudo register with REG_EQUAL to constant
      is set only once.  Failing to do so will result in IRA/reload
      spilling such constants under high register pressure instead of
-     rematerializing them.  */
+     rematerializing them.
+
+     For hardreg PRE, register pressure is not a concern, and we also want to
+     apply GCSE to simple moves.  */
 
   switch (GET_CODE (x))
     {
     case REG:
     case SUBREG:
+      return doing_hardreg_pre_p;
+
     case CALL:
       return false;
 
     CASE_CONST_ANY:
-      if (!doing_code_hoisting_p)
+      if (doing_hardreg_pre_p)
+       return true;
+      else if (!doing_code_hoisting_p)
        /* Do not PRE constants.  */
        return false;
 
@@ -911,7 +978,7 @@ oprs_unchanged_p (const_rtx x, const rtx_insn *insn, bool 
avail_p)
       }
 
     case MEM:
-      if (! flag_gcse_lm
+      if (! do_load_motion ()
          || load_killed_in_block_p (current_bb, DF_INSN_LUID (insn),
                                     x, avail_p))
        return false;
@@ -1258,8 +1325,10 @@ hash_scan_set (rtx set, rtx_insn *insn, struct 
gcse_hash_table_d *table)
          && want_to_gcse_p (XEXP (note, 0), GET_MODE (dest), NULL))
        src = XEXP (note, 0), set = gen_rtx_SET (dest, src);
 
-      /* Only record sets of pseudo-regs in the hash table.  */
-      if (regno >= FIRST_PSEUDO_REGISTER
+      /* Only record sets of pseudo-regs in the hash table, unless we're
+        currently doing hardreg switching.  */
+      if ((doing_hardreg_pre_p ? regno == current_hardreg_regno
+                                    : regno >= FIRST_PSEUDO_REGISTER)
          /* Don't GCSE something if we can't do a reg/reg copy.  */
          && can_copy_p (GET_MODE (dest))
          /* GCSE commonly inserts instruction after the insn.  We can't
@@ -1286,12 +1355,33 @@ hash_scan_set (rtx set, rtx_insn *insn, struct 
gcse_hash_table_d *table)
             able to handle code motion of insns with multiple sets.  */
          bool antic_p = (oprs_anticipatable_p (src, insn)
                          && !multiple_sets (insn));
+         if (doing_hardreg_pre_p)
+           {
+             /* An hardreg assignment is anticipatable only if the hardreg is
+                neither set nor used prior to this assignment.  */
+             auto info = reg_avail_info[current_hardreg_regno];
+             if ((info.last_bb == current_bb
+                  && info.first_set < DF_INSN_LUID (insn))
+                 || bitmap_bit_p (DF_LR_IN (current_bb),
+                                  current_hardreg_regno))
+               antic_p = false;
+           }
+
          /* An expression is not available if its operands are
             subsequently modified, including this insn.  It's also not
             available if this is a branch, because we can't insert
             a set after the branch.  */
          bool avail_p = (oprs_available_p (src, insn)
                          && ! JUMP_P (insn));
+         if (doing_hardreg_pre_p)
+           {
+             /* An hardreg assignment is only available if the hardreg is
+                not set later in the BB.  Uses of the hardreg are allowed. */
+             auto info = reg_avail_info[current_hardreg_regno];
+             if (info.last_bb == current_bb
+                 && info.last_set > DF_INSN_LUID (insn))
+               avail_p = false;
+           }
 
          insert_expr_in_table (src, GET_MODE (dest), insn, antic_p, avail_p,
                                max_distance, table);
@@ -1300,7 +1390,10 @@ hash_scan_set (rtx set, rtx_insn *insn, struct 
gcse_hash_table_d *table)
   /* In case of store we want to consider the memory value as available in
      the REG stored in that memory. This makes it possible to remove
      redundant loads from due to stores to the same location.  */
-  else if (flag_gcse_las && REG_P (src) && MEM_P (dest))
+  else if (flag_gcse_las
+          && !doing_hardreg_pre_p
+          && REG_P (src)
+          && MEM_P (dest))
     {
       unsigned int regno = REGNO (src);
       HOST_WIDE_INT max_distance = 0;
@@ -1460,7 +1553,7 @@ record_last_reg_set_info (rtx_insn *insn, int regno)
 static void
 record_last_mem_set_info (rtx_insn *insn)
 {
-  if (! flag_gcse_lm)
+  if (! do_load_motion ())
     return;
 
   record_last_mem_set_info_common (insn, modify_mem_list,
@@ -1884,6 +1977,9 @@ compute_pre_data (void)
       bitmap_not (ae_kill[bb->index], ae_kill[bb->index]);
     }
 
+  if (doing_hardreg_pre_p)
+    prune_hardreg_uses (transp, &expr_hash_table);
+
   edge_list = pre_edge_lcm (expr_hash_table.n_elems, transp, comp, antloc,
                            ae_kill, &pre_insert_map, &pre_delete_map);
   sbitmap_vector_free (antloc);
@@ -1938,7 +2034,10 @@ pre_expr_reaches_here_p_work (basic_block occr_bb, 
struct gcse_expr *expr,
 
          visited[pred_bb->index] = 1;
        }
-      /* Ignore this predecessor if it kills the expression.  */
+      /* Ignore this predecessor if it kills the expression.
+
+        If this were used for hardreg pre, then it would need to use the kills
+        bitmap.  */
       else if (! bitmap_bit_p (transp[pred_bb->index], expr->bitmap_index))
        visited[pred_bb->index] = 1;
 
@@ -2109,6 +2208,59 @@ insert_insn_end_basic_block (struct gcse_expr *expr, 
basic_block bb)
     }
 }
 
+/* Return the INSN which is added at the start of the block BB with
+   same instruction pattern with PAT.  */
+
+rtx_insn *
+insert_insn_start_basic_block (rtx_insn *pat, basic_block bb)
+{
+  rtx_insn *insn = BB_HEAD (bb);
+  rtx_insn *next_insn;
+
+  gcc_assert (pat && INSN_P (pat));
+
+  /* Insert after the last initial CODE_LABEL or NOTE_INSN_BASIC_BLOCK, before
+     any other instructions.  */
+  while ((next_insn = NEXT_INSN (insn))
+        && (LABEL_P (next_insn) || NOTE_INSN_BASIC_BLOCK_P (insn)))
+    insn = next_insn;
+
+  rtx_insn *new_insn = emit_insn_after_noloc (pat, insn, bb);
+
+  while (pat != NULL_RTX)
+    {
+      if (INSN_P (pat))
+       add_label_notes (PATTERN (pat), new_insn);
+      pat = NEXT_INSN (pat);
+    }
+
+  return new_insn;
+}
+
+/* Add EXPR to the start of basic block BB.
+
+   This is used by hardreg PRE.  */
+
+static void
+insert_insn_start_basic_block (struct gcse_expr *expr, basic_block bb)
+{
+  rtx reg = expr->reaching_reg;
+  int regno = REGNO (reg);
+
+  rtx_insn *insn = process_insert_insn (expr);
+  rtx_insn *new_insn = insert_insn_start_basic_block (insn, bb);
+
+  gcse_create_count++;
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "hardreg PRE: start of bb %d, insn %d, ",
+              bb->index, INSN_UID (new_insn));
+      fprintf (dump_file, "copying expression %d to reg %d\n",
+              expr->bitmap_index, regno);
+    }
+}
+
 /* Insert partially redundant expressions on edges in the CFG to make
    the expressions fully redundant.  */
 
@@ -2130,7 +2282,8 @@ pre_edge_insert (struct edge_list *edge_list, struct 
gcse_expr **index_map)
   for (e = 0; e < num_edges; e++)
     {
       int indx;
-      basic_block bb = INDEX_EDGE_PRED_BB (edge_list, e);
+      basic_block pred_bb = INDEX_EDGE_PRED_BB (edge_list, e);
+      basic_block succ_bb = INDEX_EDGE_SUCC_BB (edge_list, e);
 
       for (i = indx = 0; i < set_size; i++, indx += SBITMAP_ELT_BITS)
        {
@@ -2159,13 +2312,24 @@ pre_edge_insert (struct edge_list *edge_list, struct 
gcse_expr **index_map)
 
                        /* We can't insert anything on an abnormal and
                           critical edge, so we insert the insn at the end of
-                          the previous block. There are several alternatives
+                          the previous block.  There are several alternatives
                           detailed in Morgans book P277 (sec 10.5) for
                           handling this situation.  This one is easiest for
-                          now.  */
+                          now.
 
+                          For hardreg PRE  this would add an unwanted clobber
+                          of the hardreg, so we instead insert in the
+                          successor block. This may be partially redundant,
+                          but it is at least correct.  */
                        if (eg->flags & EDGE_ABNORMAL)
-                         insert_insn_end_basic_block (index_map[j], bb);
+                         {
+                           if (doing_hardreg_pre_p)
+                             insert_insn_start_basic_block (index_map[j],
+                                                            succ_bb);
+                           else
+                             insert_insn_end_basic_block (index_map[j],
+                                                          pred_bb);
+                         }
                        else
                          {
                            insn = process_insert_insn (index_map[j]);
@@ -2175,8 +2339,8 @@ pre_edge_insert (struct edge_list *edge_list, struct 
gcse_expr **index_map)
                        if (dump_file)
                          {
                            fprintf (dump_file, "PRE: edge (%d,%d), ",
-                                    bb->index,
-                                    INDEX_EDGE_SUCC_BB (edge_list, e)->index);
+                                    pred_bb->index,
+                                    succ_bb->index);
                            fprintf (dump_file, "copy expression %d\n",
                                     expr->bitmap_index);
                          }
@@ -2491,13 +2655,25 @@ pre_delete (void)
                && (set = single_set (insn)) != 0
                 && dbg_cnt (pre_insn))
              {
-               /* Create a pseudo-reg to store the result of reaching
-                  expressions into.  Get the mode for the new pseudo from
-                  the mode of the original destination pseudo.  */
+               rtx dest = SET_DEST (set);
                if (expr->reaching_reg == NULL)
-                 expr->reaching_reg = gen_reg_rtx_and_attrs (SET_DEST (set));
+                 {
+                   if (doing_hardreg_pre_p)
+                     /* Use the hardreg as the reaching register.  The
+                        deleted sets will be replaced with noop moves.
+
+                        This may change the value of the hardreg in some debug
+                        instructions, so we will need to reset any debug uses
+                        of the hardreg.  */
+                     expr->reaching_reg = dest;
+                   else
+                     /* Create a pseudo-reg to store the result of reaching
+                        expressions into.  Get the mode for the new pseudo from
+                        the mode of the original destination pseudo.  */
+                     expr->reaching_reg = gen_reg_rtx_and_attrs (SET_DEST 
(set));
+                 }
 
-               gcse_emit_move_after (SET_DEST (set), expr->reaching_reg, insn);
+               gcse_emit_move_after (dest, expr->reaching_reg, insn);
                delete_insn (insn);
                occr->deleted_p = 1;
                changed = true;
@@ -2518,6 +2694,25 @@ pre_delete (void)
   return changed;
 }
 
+/* Since hardreg PRE reuses the hardreg as the reaching register, we need to
+   eliminate any existing uses in debug insns.  This is overly conservative,
+   but there's currently no benefit to preserving the debug insns, so there's
+   no point doing the work to retain them.  */
+
+static void
+reset_hardreg_debug_uses ()
+{
+  df_ref def;
+  for (def = DF_REG_USE_CHAIN (current_hardreg_regno);
+       def;
+       def = DF_REF_NEXT_REG (def))
+    {
+      rtx_insn *insn = DF_REF_INSN (def);
+      if (DEBUG_INSN_P (insn))
+       delete_insn (insn);
+    }
+}
+
 /* Perform GCSE optimizations using PRE.
    This is called by one_pre_gcse_pass after all the dataflow analysis
    has been done.
@@ -2561,12 +2756,16 @@ pre_gcse (struct edge_list *edge_list)
 
   changed = pre_delete ();
   did_insert = pre_edge_insert (edge_list, index_map);
-
   /* In other places with reaching expressions, copy the expression to the
-     specially allocated pseudo-reg that reaches the redundant expr.  */
-  pre_insert_copies ();
+     specially allocated pseudo-reg that reaches the redundant expr.  This
+     isn't needed for hardreg PRE.  */
+  if (!doing_hardreg_pre_p)
+    pre_insert_copies ();
+
   if (did_insert)
     {
+      if (doing_hardreg_pre_p)
+       reset_hardreg_debug_uses ();
       commit_edge_insertions ();
       changed = true;
     }
@@ -2601,11 +2800,11 @@ one_pre_gcse_pass (void)
 
   alloc_hash_table (&expr_hash_table);
   add_noreturn_fake_exit_edges ();
-  if (flag_gcse_lm)
+  if (do_load_motion ())
     compute_ld_motion_mems ();
 
   compute_hash_table (&expr_hash_table);
-  if (flag_gcse_lm)
+  if (do_load_motion ())
     trim_ld_motion_mems ();
   if (dump_file)
     dump_hash_table (dump_file, "Expression", &expr_hash_table);
@@ -2621,7 +2820,7 @@ one_pre_gcse_pass (void)
       free_pre_mem ();
     }
 
-  if (flag_gcse_lm)
+  if (do_load_motion ())
     free_ld_motion_mems ();
   remove_fake_exit_edges ();
   free_hash_table (&expr_hash_table);
@@ -4028,6 +4227,32 @@ execute_rtl_pre (void)
   return 0;
 }
 
+static unsigned int
+execute_hardreg_pre (void)
+{
+#ifdef HARDREG_PRE_REGNOS
+  doing_hardreg_pre_p = true;
+  unsigned int regnos[] = HARDREG_PRE_REGNOS;
+  /* It's possible to avoid this loop, but it isn't worth doing so until
+     hardreg PRE is used for multiple hardregs.  */
+  for (int i = 0; regnos[i] != 0; i++)
+    {
+      int changed;
+      current_hardreg_regno = regnos[i];
+      if (dump_file)
+       fprintf(dump_file, "Entering hardreg PRE for regno %d\n",
+               current_hardreg_regno);
+      delete_unreachable_blocks ();
+      df_analyze ();
+      changed = one_pre_gcse_pass ();
+      if (changed)
+       cleanup_cfg (0);
+    }
+  doing_hardreg_pre_p = false;
+#endif
+  return 0;
+}
+
 static unsigned int
 execute_rtl_hoist (void)
 {
@@ -4096,6 +4321,56 @@ make_pass_rtl_pre (gcc::context *ctxt)
 
 namespace {
 
+const pass_data pass_data_hardreg_pre =
+{
+  RTL_PASS, /* type */
+  "hardreg_pre", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_PRE, /* tv_id */
+  PROP_cfglayout, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_hardreg_pre : public rtl_opt_pass
+{
+public:
+  pass_hardreg_pre (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_hardreg_pre, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *) final override;
+  unsigned int execute (function *)  final override
+  {
+    return execute_hardreg_pre ();
+  }
+
+}; // class pass_rtl_pre
+
+bool
+pass_hardreg_pre::gate (function *fun)
+{
+#ifdef HARDREG_PRE_REGNOS
+  return optimize > 0
+    && !fun->calls_setjmp;
+#else
+  return false;
+#endif
+}
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_hardreg_pre (gcc::context *ctxt)
+{
+  return new pass_hardreg_pre (ctxt);
+}
+
+namespace {
+
 const pass_data pass_data_rtl_hoist =
 {
   RTL_PASS, /* type */
diff --git a/gcc/passes.def b/gcc/passes.def
index 
ae85ae72dff734a8698f606254970437e2bf93a5..95d72b22761eec3668a4d5bbcaa8e41fcc4d830a
 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -463,6 +463,7 @@ along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_rtl_cprop);
       NEXT_PASS (pass_rtl_pre);
       NEXT_PASS (pass_rtl_hoist);
+      NEXT_PASS (pass_hardreg_pre);
       NEXT_PASS (pass_rtl_cprop);
       NEXT_PASS (pass_rtl_store_motion);
       NEXT_PASS (pass_cse_after_global_opts);
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c 
b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..f7a47f81c5ea4639827d4c902f316932120f44af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
+
+#include <arm_neon.h>
+
+void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, int br)
+{
+  float16x8_t a;
+  a = vld1q_f16(ap);
+  a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
+  vst1q_f16(ap, a);
+  if (br)
+    {
+      a = vld1q_f16(ap + 8);
+      a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
+      vst1q_f16(ap + 8, a);
+      a = vld1q_f16(ap + 16);
+      a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
+      vst1q_f16(ap + 16, a);
+    }
+  else
+    {
+      a = vld1q_f16(ap + 24);
+      a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
+      vst1q_f16(ap + 24, a);
+    }
+  a = vld1q_f16(ap + 32);
+  a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
+  vst1q_f16(ap + 32, a);
+}
+
+void bar(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, fpm_t mode, int 
br)
+{
+  float16x8_t a;
+  a = vld1q_f16(ap);
+  a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
+  vst1q_f16(ap, a);
+  if (br)
+    {
+      a = vld1q_f16(ap + 8);
+      a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
+      vst1q_f16(ap + 8, a);
+      a = vld1q_f16(ap + 16);
+      a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
+      vst1q_f16(ap + 16, a);
+    }
+  else
+    {
+      a = vld1q_f16(ap + 24);
+      a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
+      vst1q_f16(ap + 24, a);
+    }
+  a = vld1q_f16(ap + 32);
+  a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
+  vst1q_f16(ap + 32, a);
+}
+
+/* { dg-final { scan-assembler-times "msr\tfpmr" 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c 
b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..c5b255b0a9a8ea9161217b22f19adaf58c899dbb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
+
+#include <arm_neon.h>
+
+void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c)
+{
+  for (int i = 0; i < 103; i++)
+    {
+      float16x8_t a = vld1q_f16(ap + 8*i);
+      a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
+      vst1q_f16(ap + 8*i, a);
+    }
+}
+/* { dg-final { scan-assembler "msr\tfpmr.*\n\.L2" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c 
b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c
new file mode 100644
index 
0000000000000000000000000000000000000000..73a79ad4b44e2b950cf7ea3e914254b5fdc05b69
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
+
+#include <arm_neon.h>
+
+void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, fpm_t mode)
+{
+  float16x8_t x = vld1q_f16(ap + 1);
+  x = vmlalbq_f16_mf8_fpm(x, b, c, mode);
+  vst1q_f16(ap + 1, x);
+  for (int i = 0; i < 103; i++)
+    {
+      float16x8_t a = vld1q_f16(ap + 8*i);
+      a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
+      vst1q_f16(ap + 8*i, a);
+    }
+}
+/* { dg-final { scan-assembler-times "msr\tfpmr" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c 
b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c
new file mode 100644
index 
0000000000000000000000000000000000000000..18c1def752f557e98868250cd73442fb9f556e18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
+
+#include <arm_neon.h>
+
+void baz(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c)
+{
+  float16x8_t x = vld1q_f16(ap + 1);
+  x = vmlalbq_f16_mf8_fpm(x, b, c, 13);
+  vst1q_f16(ap + 1, x);
+  for (int i = 0; i < 10; i++)
+    {
+      float16x8_t a = vld1q_f16(ap + 16*i);
+      a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
+      vst1q_f16(ap + 16*i, a);
+      a = vld1q_f16(ap + 16*i + 8);
+      a = vmlalbq_f16_mf8_fpm(a, b, c, 865);
+      vst1q_f16(ap + 16*i+8, a);
+    }
+}
+
+/* { dg-final { scan-assembler-times "msr\tfpmr" 3 } } */
+/* { dg-final { scan-assembler "msr\tfpmr.*\n\tb\t" } } */
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 
ce463629194a7298b70da6463706caea0b28dabd..797d719b2c45ffa2d71c7e94687bf1d5ac19c69f
 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -573,6 +573,7 @@ extern rtl_opt_pass *make_pass_rtl_dse3 (gcc::context 
*ctxt);
 extern rtl_opt_pass *make_pass_rtl_cprop (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_rtl_pre (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_rtl_hoist (gcc::context *ctxt);
+extern rtl_opt_pass *make_pass_hardreg_pre (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_rtl_avoid_store_forwarding (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_rtl_store_motion (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_cse_after_global_opts (gcc::context *ctxt);

Reply via email to