This pass is used to optimise assignments to the FPMR register in aarch64. I chose to implement this as a middle-end pass because it mostly reuses the existing RTL PRE code within gcse.cc.
Compared to RTL PRE, the key difference in this new pass is that we insert new writes directly to the destination hardreg, instead of writing to a new pseudo-register and copying the result later. This requires changes to the analysis portion of the pass, because sets cannot be moved before existing instructions that set, use or clobber the hardreg, and the value becomes unavailable after any uses of clobbers of the hardreg. Any uses of the hardreg in debug insns will be deleted. We could do better than this, but for the aarch64 fpmr I don't think we emit useful debuginfo for deleted fp8 instructions anyway (and I don't even know if it's possible to have a debug fpmr use when entering hardreg PRE). Compared to the first version, I've now fixed the broken debug uses, and simplified a lot of the analysis (it turns out DF analysis already provides cleaner versions of the checks I need). I also fixed a couple of other minor bugs (including one that broke the build on every target except aarch64). The new tests pass; I haven't rerun a bootstrap or full regression test yet, but this should be NFC except for aarch64 code that uses the fpmr register. Is this ok for master? gcc/ChangeLog: * config/aarch64/aarch64.h (HARDREG_PRE_REGNOS): New macro. * gcse.cc (doing_hardreg_pre_p): New global variable. (do_load_motion): New boolean check. (current_hardreg_regno): New global variable. (compute_local_properties): Unset transp for hardreg clobbers. (prune_hardreg_uses): New function. (want_to_gcse_p): Use different checks for hardreg PRE. (oprs_unchanged_p): Disable load motion for hardreg PRE pass. (hash_scan_set): For hardreg PRE, skip non-hardreg sets and check for hardreg clobbers. (record_last_mem_set_info): Skip for hardreg PRE. (compute_pre_data): Prune hardreg uses from transp bitmap. (pre_expr_reaches_here_p_work): Add sentence to comment. (insert_insn_start_basic_block): New functions. (pre_edge_insert): Don't add hardreg sets to predecessor block. 
(pre_delete): Use hardreg for the reaching reg. (reset_hardreg_debug_uses): New function. (pre_gcse): For hardreg PRE, reset debug uses and don't insert copies. (one_pre_gcse_pass): Disable load motion for hardreg PRE. (execute_hardreg_pre): New. (class pass_hardreg_pre): New. (pass_hardreg_pre::gate): New. (make_pass_hardreg_pre): New. * passes.def (pass_hardreg_pre): New pass. * tree-pass.h (make_pass_hardreg_pre): New. gcc/testsuite/ChangeLog: * gcc.target/aarch64/acle/fpmr-1.c: New test. * gcc.target/aarch64/acle/fpmr-2.c: New test. * gcc.target/aarch64/acle/fpmr-3.c: New test. * gcc.target/aarch64/acle/fpmr-4.c: New test. diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index f1251f67c74e8da8420bad2d07a11a98a7de37ff..61837a4a98744225b9d15cfbc37cc914ac48421b 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -1652,6 +1652,10 @@ enum class aarch64_tristate_mode : int { NO, YES, MAYBE }; { int (aarch64_tristate_mode::MAYBE), \ int (aarch64_local_sme_state::ANY) } +/* Zero terminated list of regnos for which hardreg PRE should be + applied. */ +#define HARDREG_PRE_REGNOS { FPM_REGNUM, 0 } + #endif #endif /* GCC_AARCH64_H */ diff --git a/gcc/gcse.cc b/gcc/gcse.cc index 31b92f30fa1ba6c519429d4b7bc55547b2d71c01..f33de3747b896950568154acbfac1817519fe748 100644 --- a/gcc/gcse.cc +++ b/gcc/gcse.cc @@ -415,6 +415,17 @@ static int gcse_create_count; /* Doing code hoisting. */ static bool doing_code_hoisting_p = false; + +/* Doing hardreg_pre. */ +static bool doing_hardreg_pre_p = false; + +inline bool +do_load_motion () +{ + return flag_gcse_lm && !doing_hardreg_pre_p; +} + +static unsigned int current_hardreg_regno; /* For available exprs */ static sbitmap *ae_kill; @@ -689,14 +700,32 @@ compute_local_properties (sbitmap *transp, sbitmap *comp, sbitmap *antloc, int indx = expr->bitmap_index; struct gcse_occr *occr; - /* The expression is transparent in this block if it is not killed. 
- We start by assuming all are transparent [none are killed], and - then reset the bits for those that are. */ + /* In most cases, the expression is transparent in the block if it is + not killed. The exception to this is during hardreg PRE, in which + uses of the hardreg prevent transparency but do not kill the + expression. + + We start by assuming all expressions are transparent [none are + killed], and then reset the bits for those that are. */ if (transp) - compute_transp (expr->expr, indx, transp, - blocks_with_calls, - modify_mem_list_set, - canon_modify_mem_list); + { + compute_transp (expr->expr, indx, transp, + blocks_with_calls, + modify_mem_list_set, + canon_modify_mem_list); + + if (doing_hardreg_pre_p) + { + /* We also need to check whether the destination hardreg is + set or call-clobbered in each BB. We'll check for hardreg + uses later. */ + df_ref def; + for (def = DF_REG_DEF_CHAIN (current_hardreg_regno); + def; + def = DF_REF_NEXT_REG (def)) + bitmap_clear_bit (transp[DF_REF_BB (def)->index], indx); + } + } /* The occurrences recorded in antic_occr are exactly those that we want to set to nonzero in ANTLOC. */ @@ -728,6 +757,37 @@ compute_local_properties (sbitmap *transp, sbitmap *comp, sbitmap *antloc, } } } + +/* A hardreg set is not transparent in a block if there are any uses of that + hardreg. This filters the results of compute_local_properties, after the + result of that function has been used to define the kills bitmap. + + TRANSP is the destination sbitmap to be updated. + + TABLE controls which hash table to look at. 
*/ + +static void +prune_hardreg_uses (sbitmap *transp, struct gcse_hash_table_d *table) +{ + unsigned int i; + gcc_assert (doing_hardreg_pre_p); + + for (i = 0; i < table->size; i++) + { + struct gcse_expr *expr; + + for (expr = table->table[i]; expr != NULL; expr = expr->next_same_hash) + { + int indx = expr->bitmap_index; + df_ref def; + + for (def = DF_REG_USE_CHAIN (current_hardreg_regno); + def; + def = DF_REF_NEXT_REG (def)) + bitmap_clear_bit (transp[DF_REF_BB (def)->index], indx); + } + } +} /* Hash table support. */ @@ -771,17 +831,24 @@ want_to_gcse_p (rtx x, machine_mode mode, HOST_WIDE_INT *max_distance_ptr) pressure, i.e., a pseudo register with REG_EQUAL to constant is set only once. Failing to do so will result in IRA/reload spilling such constants under high register pressure instead of - rematerializing them. */ + rematerializing them. + + For hardreg PRE, register pressure is not a concern, and we also want to + apply GCSE to simple moves. */ switch (GET_CODE (x)) { case REG: case SUBREG: + return doing_hardreg_pre_p; + case CALL: return false; CASE_CONST_ANY: - if (!doing_code_hoisting_p) + if (doing_hardreg_pre_p) + return true; + else if (!doing_code_hoisting_p) /* Do not PRE constants. */ return false; @@ -911,7 +978,7 @@ oprs_unchanged_p (const_rtx x, const rtx_insn *insn, bool avail_p) } case MEM: - if (! flag_gcse_lm + if (! do_load_motion () || load_killed_in_block_p (current_bb, DF_INSN_LUID (insn), x, avail_p)) return false; @@ -1258,8 +1325,10 @@ hash_scan_set (rtx set, rtx_insn *insn, struct gcse_hash_table_d *table) && want_to_gcse_p (XEXP (note, 0), GET_MODE (dest), NULL)) src = XEXP (note, 0), set = gen_rtx_SET (dest, src); - /* Only record sets of pseudo-regs in the hash table. */ - if (regno >= FIRST_PSEUDO_REGISTER + /* Only record sets of pseudo-regs in the hash table, unless we're + currently doing hardreg switching. */ + if ((doing_hardreg_pre_p ? 
regno == current_hardreg_regno + : regno >= FIRST_PSEUDO_REGISTER) /* Don't GCSE something if we can't do a reg/reg copy. */ && can_copy_p (GET_MODE (dest)) /* GCSE commonly inserts instruction after the insn. We can't @@ -1286,12 +1355,33 @@ hash_scan_set (rtx set, rtx_insn *insn, struct gcse_hash_table_d *table) able to handle code motion of insns with multiple sets. */ bool antic_p = (oprs_anticipatable_p (src, insn) && !multiple_sets (insn)); + if (doing_hardreg_pre_p) + { + /* An hardreg assignment is anticipatable only if the hardreg is + neither set nor used prior to this assignment. */ + auto info = reg_avail_info[current_hardreg_regno]; + if ((info.last_bb == current_bb + && info.first_set < DF_INSN_LUID (insn)) + || bitmap_bit_p (DF_LR_IN (current_bb), + current_hardreg_regno)) + antic_p = false; + } + /* An expression is not available if its operands are subsequently modified, including this insn. It's also not available if this is a branch, because we can't insert a set after the branch. */ bool avail_p = (oprs_available_p (src, insn) && ! JUMP_P (insn)); + if (doing_hardreg_pre_p) + { + /* An hardreg assignment is only available if the hardreg is + not set later in the BB. Uses of the hardreg are allowed. */ + auto info = reg_avail_info[current_hardreg_regno]; + if (info.last_bb == current_bb + && info.last_set > DF_INSN_LUID (insn)) + avail_p = false; + } insert_expr_in_table (src, GET_MODE (dest), insn, antic_p, avail_p, max_distance, table); @@ -1300,7 +1390,10 @@ hash_scan_set (rtx set, rtx_insn *insn, struct gcse_hash_table_d *table) /* In case of store we want to consider the memory value as available in the REG stored in that memory. This makes it possible to remove redundant loads from due to stores to the same location. 
*/ - else if (flag_gcse_las && REG_P (src) && MEM_P (dest)) + else if (flag_gcse_las + && !doing_hardreg_pre_p + && REG_P (src) + && MEM_P (dest)) { unsigned int regno = REGNO (src); HOST_WIDE_INT max_distance = 0; @@ -1460,7 +1553,7 @@ record_last_reg_set_info (rtx_insn *insn, int regno) static void record_last_mem_set_info (rtx_insn *insn) { - if (! flag_gcse_lm) + if (! do_load_motion ()) return; record_last_mem_set_info_common (insn, modify_mem_list, @@ -1884,6 +1977,9 @@ compute_pre_data (void) bitmap_not (ae_kill[bb->index], ae_kill[bb->index]); } + if (doing_hardreg_pre_p) + prune_hardreg_uses (transp, &expr_hash_table); + edge_list = pre_edge_lcm (expr_hash_table.n_elems, transp, comp, antloc, ae_kill, &pre_insert_map, &pre_delete_map); sbitmap_vector_free (antloc); @@ -1938,7 +2034,10 @@ pre_expr_reaches_here_p_work (basic_block occr_bb, struct gcse_expr *expr, visited[pred_bb->index] = 1; } - /* Ignore this predecessor if it kills the expression. */ + /* Ignore this predecessor if it kills the expression. + + If this were used for hardreg pre, then it would need to use the kills + bitmap. */ else if (! bitmap_bit_p (transp[pred_bb->index], expr->bitmap_index)) visited[pred_bb->index] = 1; @@ -2109,6 +2208,59 @@ insert_insn_end_basic_block (struct gcse_expr *expr, basic_block bb) } } +/* Return the INSN which is added at the start of the block BB with + same instruction pattern with PAT. */ + +rtx_insn * +insert_insn_start_basic_block (rtx_insn *pat, basic_block bb) +{ + rtx_insn *insn = BB_HEAD (bb); + rtx_insn *next_insn; + + gcc_assert (pat && INSN_P (pat)); + + /* Insert after the last initial CODE_LABEL or NOTE_INSN_BASIC_BLOCK, before + any other instructions. 
*/ + while ((next_insn = NEXT_INSN (insn)) + && (LABEL_P (next_insn) || NOTE_INSN_BASIC_BLOCK_P (next_insn))) + insn = next_insn; + + rtx_insn *new_insn = emit_insn_after_noloc (pat, insn, bb); + + while (pat != NULL_RTX) + { + if (INSN_P (pat)) + add_label_notes (PATTERN (pat), new_insn); + pat = NEXT_INSN (pat); + } + + return new_insn; +} + +/* Add EXPR to the start of basic block BB. + + This is used by hardreg PRE. */ + +static void +insert_insn_start_basic_block (struct gcse_expr *expr, basic_block bb) +{ + rtx reg = expr->reaching_reg; + int regno = REGNO (reg); + + rtx_insn *insn = process_insert_insn (expr); + rtx_insn *new_insn = insert_insn_start_basic_block (insn, bb); + + gcse_create_count++; + + if (dump_file) + { + fprintf (dump_file, "hardreg PRE: start of bb %d, insn %d, ", + bb->index, INSN_UID (new_insn)); + fprintf (dump_file, "copying expression %d to reg %d\n", + expr->bitmap_index, regno); + } +} + /* Insert partially redundant expressions on edges in the CFG to make the expressions fully redundant. */ @@ -2130,7 +2282,8 @@ pre_edge_insert (struct edge_list *edge_list, struct gcse_expr **index_map) for (e = 0; e < num_edges; e++) { int indx; - basic_block bb = INDEX_EDGE_PRED_BB (edge_list, e); + basic_block pred_bb = INDEX_EDGE_PRED_BB (edge_list, e); + basic_block succ_bb = INDEX_EDGE_SUCC_BB (edge_list, e); for (i = indx = 0; i < set_size; i++, indx += SBITMAP_ELT_BITS) { @@ -2159,13 +2312,24 @@ pre_edge_insert (struct edge_list *edge_list, struct gcse_expr **index_map) /* We can't insert anything on an abnormal and critical edge, so we insert the insn at the end of - the previous block. There are several alternatives + the previous block. There are several alternatives detailed in Morgans book P277 (sec 10.5) for handling this situation. This one is easiest for - now. */ + now. + For hardreg PRE this would add an unwanted clobber + of the hardreg, so we instead insert in the + successor block. 
This may be partially redundant, + but it is at least correct. */ if (eg->flags & EDGE_ABNORMAL) - insert_insn_end_basic_block (index_map[j], bb); + { + if (doing_hardreg_pre_p) + insert_insn_start_basic_block (index_map[j], + succ_bb); + else + insert_insn_end_basic_block (index_map[j], + pred_bb); + } else { insn = process_insert_insn (index_map[j]); @@ -2175,8 +2339,8 @@ pre_edge_insert (struct edge_list *edge_list, struct gcse_expr **index_map) if (dump_file) { fprintf (dump_file, "PRE: edge (%d,%d), ", - bb->index, - INDEX_EDGE_SUCC_BB (edge_list, e)->index); + pred_bb->index, + succ_bb->index); fprintf (dump_file, "copy expression %d\n", expr->bitmap_index); } @@ -2491,13 +2655,25 @@ pre_delete (void) && (set = single_set (insn)) != 0 && dbg_cnt (pre_insn)) { - /* Create a pseudo-reg to store the result of reaching - expressions into. Get the mode for the new pseudo from - the mode of the original destination pseudo. */ + rtx dest = SET_DEST (set); if (expr->reaching_reg == NULL) - expr->reaching_reg = gen_reg_rtx_and_attrs (SET_DEST (set)); + { + if (doing_hardreg_pre_p) + /* Use the hardreg as the reaching register. The + deleted sets will be replaced with noop moves. + + This may change the value of the hardreg in some debug + instructions, so we will need to reset any debug uses + of the hardreg. */ + expr->reaching_reg = dest; + else + /* Create a pseudo-reg to store the result of reaching + expressions into. Get the mode for the new pseudo from + the mode of the original destination pseudo. */ + expr->reaching_reg = gen_reg_rtx_and_attrs (SET_DEST (set)); + } - gcse_emit_move_after (SET_DEST (set), expr->reaching_reg, insn); + gcse_emit_move_after (dest, expr->reaching_reg, insn); delete_insn (insn); occr->deleted_p = 1; changed = true; @@ -2518,6 +2694,25 @@ pre_delete (void) return changed; } +/* Since hardreg PRE reuses the hardreg as the reaching register, we need to + eliminate any existing uses in debug insns. 
This is overly conservative, + but there's currently no benefit to preserving the debug insns, so there's + no point doing the work to retain them. */ + +static void +reset_hardreg_debug_uses () +{ + df_ref def; + for (def = DF_REG_USE_CHAIN (current_hardreg_regno); + def; + def = DF_REF_NEXT_REG (def)) + { + rtx_insn *insn = DF_REF_INSN (def); + if (DEBUG_INSN_P (insn)) + delete_insn (insn); + } +} + /* Perform GCSE optimizations using PRE. This is called by one_pre_gcse_pass after all the dataflow analysis has been done. @@ -2561,12 +2756,16 @@ pre_gcse (struct edge_list *edge_list) changed = pre_delete (); did_insert = pre_edge_insert (edge_list, index_map); - /* In other places with reaching expressions, copy the expression to the - specially allocated pseudo-reg that reaches the redundant expr. */ - pre_insert_copies (); + specially allocated pseudo-reg that reaches the redundant expr. This + isn't needed for hardreg PRE. */ + if (!doing_hardreg_pre_p) + pre_insert_copies (); + if (did_insert) { + if (doing_hardreg_pre_p) + reset_hardreg_debug_uses (); commit_edge_insertions (); changed = true; } @@ -2601,11 +2800,11 @@ one_pre_gcse_pass (void) alloc_hash_table (&expr_hash_table); add_noreturn_fake_exit_edges (); - if (flag_gcse_lm) + if (do_load_motion ()) compute_ld_motion_mems (); compute_hash_table (&expr_hash_table); - if (flag_gcse_lm) + if (do_load_motion ()) trim_ld_motion_mems (); if (dump_file) dump_hash_table (dump_file, "Expression", &expr_hash_table); @@ -2621,7 +2820,7 @@ one_pre_gcse_pass (void) free_pre_mem (); } - if (flag_gcse_lm) + if (do_load_motion ()) free_ld_motion_mems (); remove_fake_exit_edges (); free_hash_table (&expr_hash_table); @@ -4028,6 +4227,32 @@ execute_rtl_pre (void) return 0; } +static unsigned int +execute_hardreg_pre (void) +{ +#ifdef HARDREG_PRE_REGNOS + doing_hardreg_pre_p = true; + unsigned int regnos[] = HARDREG_PRE_REGNOS; + /* It's possible to avoid this loop, but it isn't worth doing so until + hardreg PRE is used 
for multiple hardregs. */ + for (int i = 0; regnos[i] != 0; i++) + { + int changed; + current_hardreg_regno = regnos[i]; + if (dump_file) + fprintf(dump_file, "Entering hardreg PRE for regno %d\n", + current_hardreg_regno); + delete_unreachable_blocks (); + df_analyze (); + changed = one_pre_gcse_pass (); + if (changed) + cleanup_cfg (0); + } + doing_hardreg_pre_p = false; +#endif + return 0; +} + static unsigned int execute_rtl_hoist (void) { @@ -4096,6 +4321,56 @@ make_pass_rtl_pre (gcc::context *ctxt) namespace { +const pass_data pass_data_hardreg_pre = +{ + RTL_PASS, /* type */ + "hardreg_pre", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_PRE, /* tv_id */ + PROP_cfglayout, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_hardreg_pre : public rtl_opt_pass +{ +public: + pass_hardreg_pre (gcc::context *ctxt) + : rtl_opt_pass (pass_data_hardreg_pre, ctxt) + {} + + /* opt_pass methods: */ + bool gate (function *) final override; + unsigned int execute (function *) final override + { + return execute_hardreg_pre (); + } + +}; // class pass_rtl_pre + +bool +pass_hardreg_pre::gate (function *fun) +{ +#ifdef HARDREG_PRE_REGNOS + return optimize > 0 + && !fun->calls_setjmp; +#else + return false; +#endif +} + +} // anon namespace + +rtl_opt_pass * +make_pass_hardreg_pre (gcc::context *ctxt) +{ + return new pass_hardreg_pre (ctxt); +} + +namespace { + const pass_data pass_data_rtl_hoist = { RTL_PASS, /* type */ diff --git a/gcc/passes.def b/gcc/passes.def index ae85ae72dff734a8698f606254970437e2bf93a5..95d72b22761eec3668a4d5bbcaa8e41fcc4d830a 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -463,6 +463,7 @@ along with GCC; see the file COPYING3. 
If not see NEXT_PASS (pass_rtl_cprop); NEXT_PASS (pass_rtl_pre); NEXT_PASS (pass_rtl_hoist); + NEXT_PASS (pass_hardreg_pre); NEXT_PASS (pass_rtl_cprop); NEXT_PASS (pass_rtl_store_motion); NEXT_PASS (pass_cse_after_global_opts); diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c new file mode 100644 index 0000000000000000000000000000000000000000..f7a47f81c5ea4639827d4c902f316932120f44af --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c @@ -0,0 +1,58 @@ +/* { dg-do compile } */ +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */ + +#include <arm_neon.h> + +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, int br) +{ + float16x8_t a; + a = vld1q_f16(ap); + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); + vst1q_f16(ap, a); + if (br) + { + a = vld1q_f16(ap + 8); + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); + vst1q_f16(ap + 8, a); + a = vld1q_f16(ap + 16); + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); + vst1q_f16(ap + 16, a); + } + else + { + a = vld1q_f16(ap + 24); + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); + vst1q_f16(ap + 24, a); + } + a = vld1q_f16(ap + 32); + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); + vst1q_f16(ap + 32, a); +} + +void bar(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, fpm_t mode, int br) +{ + float16x8_t a; + a = vld1q_f16(ap); + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); + vst1q_f16(ap, a); + if (br) + { + a = vld1q_f16(ap + 8); + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); + vst1q_f16(ap + 8, a); + a = vld1q_f16(ap + 16); + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); + vst1q_f16(ap + 16, a); + } + else + { + a = vld1q_f16(ap + 24); + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); + vst1q_f16(ap + 24, a); + } + a = vld1q_f16(ap + 32); + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); + vst1q_f16(ap + 32, a); +} + +/* { dg-final { scan-assembler-times "msr\tfpmr" 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c new file mode 100644 index 
0000000000000000000000000000000000000000..c5b255b0a9a8ea9161217b22f19adaf58c899dbb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */ + +#include <arm_neon.h> + +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c) +{ + for (int i = 0; i < 103; i++) + { + float16x8_t a = vld1q_f16(ap + 8*i); + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); + vst1q_f16(ap + 8*i, a); + } +} +/* { dg-final { scan-assembler "msr\tfpmr.*\n\.L2" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c new file mode 100644 index 0000000000000000000000000000000000000000..73a79ad4b44e2b950cf7ea3e914254b5fdc05b69 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */ + +#include <arm_neon.h> + +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, fpm_t mode) +{ + float16x8_t x = vld1q_f16(ap + 1); + x = vmlalbq_f16_mf8_fpm(x, b, c, mode); + vst1q_f16(ap + 1, x); + for (int i = 0; i < 103; i++) + { + float16x8_t a = vld1q_f16(ap + 8*i); + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); + vst1q_f16(ap + 8*i, a); + } +} +/* { dg-final { scan-assembler-times "msr\tfpmr" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c new file mode 100644 index 0000000000000000000000000000000000000000..18c1def752f557e98868250cd73442fb9f556e18 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */ + +#include <arm_neon.h> + +void baz(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c) +{ + float16x8_t x = vld1q_f16(ap + 1); + x = vmlalbq_f16_mf8_fpm(x, b, c, 13); + vst1q_f16(ap + 1, x); + for (int i = 0; i < 10; i++) + { + float16x8_t a = vld1q_f16(ap + 16*i); + a = 
vmlalbq_f16_mf8_fpm(a, b, c, 13); + vst1q_f16(ap + 16*i, a); + a = vld1q_f16(ap + 16*i + 8); + a = vmlalbq_f16_mf8_fpm(a, b, c, 865); + vst1q_f16(ap + 16*i+8, a); + } +} + +/* { dg-final { scan-assembler-times "msr\tfpmr" 3 } } */ +/* { dg-final { scan-assembler "msr\tfpmr.*\n\tb\t" } } */ diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index ce463629194a7298b70da6463706caea0b28dabd..797d719b2c45ffa2d71c7e94687bf1d5ac19c69f 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -573,6 +573,7 @@ extern rtl_opt_pass *make_pass_rtl_dse3 (gcc::context *ctxt); extern rtl_opt_pass *make_pass_rtl_cprop (gcc::context *ctxt); extern rtl_opt_pass *make_pass_rtl_pre (gcc::context *ctxt); extern rtl_opt_pass *make_pass_rtl_hoist (gcc::context *ctxt); +extern rtl_opt_pass *make_pass_hardreg_pre (gcc::context *ctxt); extern rtl_opt_pass *make_pass_rtl_avoid_store_forwarding (gcc::context *ctxt); extern rtl_opt_pass *make_pass_rtl_store_motion (gcc::context *ctxt); extern rtl_opt_pass *make_pass_cse_after_global_opts (gcc::context *ctxt);