Both patterns used !reload_completed as a condition, which is questionable at best. The branch pattern failed to include a clobber of CC_REGNUM. Both problems were unlikely to trigger in practice, due to how the optimization pipeline is organized, but let's fix them anyway.
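For reference, a minimal example of the kind of equality test these patterns
target (hypothetical; not taken from the patch or the testsuite, and the
constant is arbitrary):

  /* Comparison against a 24-bit constant that is neither a valid 12-bit
     arithmetic immediate nor a bitmask immediate.  With the split patterns
     this is expected to become a sub/subs pair feeding cset (or b<eq,ne>),
     instead of materializing the constant with mov/movk and then issuing
     cmp.  */
  int
  test_eq (long x)
  {
    return x == 0x123456;
  }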
gcc:
	* config/aarch64/aarch64.cc (aarch64_gen_compare_split_imm24): New.
	* config/aarch64/aarch64-protos.h (aarch64_gen_compare_split_imm24):
	Declare.
	* config/aarch64/aarch64.md (*aarch64_bcond_wide_imm<GPI>): Use it.
	Add match_scratch and cc clobbers.  Use match_operator instead of
	iterator expansion.
	(*compare_cstore<GPI>_insn): Likewise.
---
 gcc/config/aarch64/aarch64-protos.h |  1 +
 gcc/config/aarch64/aarch64.cc       | 37 +++++++++++++++
 gcc/config/aarch64/aarch64.md       | 74 ++++++++++------------------
 3 files changed, 63 insertions(+), 49 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 7b9b16bd3bd..d26e1d5642e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1098,6 +1098,7 @@ bool aarch64_legitimate_address_p (machine_mode, rtx, bool,
 						   aarch64_addr_query_type = ADDR_QUERY_M);
 machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);
 rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx);
+rtx aarch64_gen_compare_split_imm24 (rtx, rtx, rtx);
 bool aarch64_maxmin_plus_const (rtx_code, rtx *, bool);
 rtx aarch64_load_tp (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c12365868b7..650da2ff95d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2882,6 +2882,43 @@ aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
   return aarch64_gen_compare_reg (code, x, y);
 }
 
+/* Split IMM into two 12-bit halves, producing an EQ/NE comparison with X.
+   TMP may be a scratch.  This optimizes the sequence
+	mov	x0, #imm1
+	movk	x0, #imm2, lsl 16	-- x0 contains CST
+	cmp	x1, x0
+   into the shorter
+	sub	tmp, x1, #(CST & 0xfff000)
+	subs	tmp, tmp, #(CST & 0x000fff)  */
+rtx
+aarch64_gen_compare_split_imm24 (rtx x, rtx imm, rtx tmp)
+{
+  HOST_WIDE_INT lo_imm = UINTVAL (imm) & 0xfff;
+  HOST_WIDE_INT hi_imm = UINTVAL (imm) & 0xfff000;
+  machine_mode mode = GET_MODE (x);
+
+  if (GET_CODE (tmp) == SCRATCH)
+    tmp = gen_reg_rtx (mode);
+
+  emit_insn (gen_add3_insn (tmp, x, GEN_INT (-hi_imm)));
+  /* TODO: We don't need the gpr result of the second insn.  */
+  switch (mode)
+    {
+    case E_SImode:
+      tmp = gen_addsi3_compare0 (tmp, tmp, GEN_INT (-lo_imm));
+      break;
+    case E_DImode:
+      tmp = gen_adddi3_compare0 (tmp, tmp, GEN_INT (-lo_imm));
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  emit_insn (tmp);
+
+  return gen_rtx_REG (CC_NZmode, CC_REGNUM);
+}
+
 /* Generate conditional branch to LABEL, comparing X to 0 using CODE.
    Return the jump instruction.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c7b1b8b3860..ec7dea8de31 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -978,35 +978,24 @@
 	  (const_string "yes")))]
 )
 
-;; For a 24-bit immediate CST we can optimize the compare for equality
-;; and branch sequence from:
-;;	mov	x0, #imm1
-;;	movk	x0, #imm2, lsl 16	/* x0 contains CST.  */
-;;	cmp	x1, x0
-;;	b<ne,eq> .Label
-;; into the shorter:
-;;	sub	x0, x1, #(CST & 0xfff000)
-;;	subs	x0, x0, #(CST & 0x000fff)
-;;	b<ne,eq> .Label
+;; For a 24-bit immediate CST we can optimize the compare for equality.
 (define_insn_and_split "*aarch64_bcond_wide_imm<GPI:mode>"
-  [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
-				(match_operand:GPI 1 "aarch64_split_imm24" "n"))
-			   (label_ref:P (match_operand 2))
-			   (pc)))]
-  "!reload_completed"
+  [(set (pc) (if_then_else
+	       (match_operator 0 "aarch64_equality_operator"
+		 [(match_operand:GPI 1 "register_operand" "r")
+		  (match_operand:GPI 2 "aarch64_split_imm24" "n")])
+	       (label_ref (match_operand 3))
+	       (pc)))
+   (clobber (reg:CC CC_REGNUM))
+   (clobber (match_scratch:GPI 4 "=r"))]
+  ""
   "#"
-  "&& true"
+  ""
  [(const_int 0)]
  {
-    HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff;
-    HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000;
-    rtx tmp = gen_reg_rtx (<GPI:MODE>mode);
-    emit_insn (gen_add<GPI:mode>3 (tmp, operands[0], GEN_INT (-hi_imm)));
-    emit_insn (gen_add<GPI:mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
-    rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
-    rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <GPI:MODE>mode,
-				  cc_reg, const0_rtx);
-    emit_jump_insn (gen_aarch64_bcond (cmp_rtx, cc_reg, operands[2]));
+    rtx cc_reg = aarch64_gen_compare_split_imm24 (operands[1], operands[2],
+						  operands[4]);
+    emit_jump_insn (gen_aarch64_bcond (operands[0], cc_reg, operands[3]));
     DONE;
   }
 )
@@ -4631,37 +4620,24 @@
   [(set_attr "type" "csel")]
 )
 
-;; For a 24-bit immediate CST we can optimize the compare for equality
-;; and branch sequence from:
-;;	mov	x0, #imm1
-;;	movk	x0, #imm2, lsl 16	/* x0 contains CST.  */
-;;	cmp	x1, x0
-;;	cset	x2, <ne,eq>
-;; into the shorter:
-;;	sub	x0, x1, #(CST & 0xfff000)
-;;	subs	x0, x0, #(CST & 0x000fff)
-;;	cset	x2, <ne, eq>.
+;; For a 24-bit immediate CST we can optimize the compare for equality.
 (define_insn_and_split "*compare_cstore<mode>_insn"
   [(set (match_operand:GPI 0 "register_operand" "=r")
-	(EQL:GPI (match_operand:GPI 1 "register_operand" "r")
-		 (match_operand:GPI 2 "aarch64_split_imm24" "n")))
-   (clobber (reg:CC CC_REGNUM))]
-  "!reload_completed"
+	(match_operator:GPI 1 "aarch64_equality_operator"
+	  [(match_operand:GPI 2 "register_operand" "r")
+	   (match_operand:GPI 3 "aarch64_split_imm24" "n")]))
+   (clobber (reg:CC CC_REGNUM))
+   (clobber (match_scratch:GPI 4 "=r"))]
+  ""
   "#"
-  "&& true"
+  ""
  [(const_int 0)]
  {
-    HOST_WIDE_INT lo_imm = UINTVAL (operands[2]) & 0xfff;
-    HOST_WIDE_INT hi_imm = UINTVAL (operands[2]) & 0xfff000;
-    rtx tmp = gen_reg_rtx (<MODE>mode);
-    emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (-hi_imm)));
-    emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
-    rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
-    rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx);
-    emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp_rtx, cc_reg));
+    rtx cc_reg = aarch64_gen_compare_split_imm24 (operands[2], operands[3],
+						  operands[4]);
+    emit_insn (gen_aarch64_cstore<mode> (operands[0], operands[1], cc_reg));
     DONE;
   }
-  [(set_attr "type" "csel")]
 )
 
 ;; zero_extend version of the above
-- 
2.34.1