Re: [PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-21 Thread Richard Sandiford via Gcc-patches
Wilco Dijkstra  writes:
> Hi Richard,
>
>> Can you do the aarch64_mov_imm changes as a separate patch?  It's difficult
>> to review the two changes folded together like this.
>
> Sure, I'll send a separate patch. So here is version 2 again:

I still think we should move the functions to avoid the forward
declarations.  That part was fine (and OK to review).  It was folding
in the extra changes to the way that we generate move immediates that
made it difficult.

Could you send a patch that makes only the changes in v2, but moves
the functions around?  In fact, the positioning of the functions
in the v3 patch looked good, so the patch is OK with the contents
of v2 but the positioning of v3.

Thanks,
Richard

> [PATCH v2][AArch64] Improve immediate expansion [PR106583]
>
> Improve immediate expansion of immediates which can be created from a
> bitmask immediate and 2 MOVKs.  Simplify, refactor and improve
> efficiency of bitmask checks.  This reduces the number of 4-instruction
> immediates in SPECINT/FP by 10-15%.
>
> Passes regress, OK for commit?
>
> gcc/ChangeLog:
>
> PR target/106583
> * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate)
> Add support for a bitmask immediate with 2 MOVKs.
> (aarch64_check_bitmask): New function after refactorization.
> (aarch64_replicate_bitmask_imm): Remove function, merge into...
> (aarch64_bitmask_imm): Simplify replication of small modes.
> Split function into 64-bit only version for efficiency.
>
> gcc/testsuite:
> PR target/106583
> * gcc.target/aarch64/pr106583.c: Add new test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> 926e81f028c82aac9a5fecc18f921f84399c24ae..b2d9c7380975028131d0fe731a97b3909874b87b
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -306,6 +306,7 @@ static machine_mode aarch64_simd_container_mode 
> (scalar_mode, poly_int64);
>  static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
>  aarch64_addr_query_type);
>  static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
> +static bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT);
>
>  /* The processor for which instructions should be scheduled.  */
>  enum aarch64_processor aarch64_tune = cortexa53;
> @@ -5502,6 +5503,30 @@ aarch64_output_sve_vector_inc_dec (const char 
> *operands, rtx x)
>   factor, nelts_per_vq);
>  }
>
> +/* Return true if the immediate VAL can be a bitfield immediate
> +   by changing the given MASK bits in VAL to zeroes, ones or bits
> +   from the other half of VAL.  Return the new immediate in VAL2.  */
> +static inline bool
> +aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
> +  unsigned HOST_WIDE_INT &val2,
> +  unsigned HOST_WIDE_INT mask)
> +{
> +  val2 = val & ~mask;
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +return true;
> +  val2 = val | mask;
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +return true;
> +  val = val & ~mask;
> +  val2 = val | (((val >> 32) | (val << 32)) & mask);
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +return true;
> +  val2 = val | (((val >> 16) | (val << 48)) & mask);
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +return true;
> +  return false;
> +}
> +
>  static int
>  aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>  scalar_int_mode mode)
> @@ -5568,36 +5593,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, 
> bool generate,
>one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
>  ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
>
> -  if (zero_match != 2 && one_match != 2)
> +  if (zero_match < 2 && one_match < 2)
>  {
>/* Try emitting a bitmask immediate with a movk replacing 16 bits.
>   For a 64-bit bitmask try whether changing 16 bits to all ones or
>   zeroes creates a valid bitmask.  To check any repeated bitmask,
>   try using 16 bits from the other 32-bit half of val.  */
>
> -  for (i = 0; i < 64; i += 16, mask <<= 16)
> -   {
> - val2 = val & ~mask;
> - if (val2 != val && aarch64_bitmask_imm (val2, mode))
> -   break;
> - val2 = val | mask;
> - if (val2 != val && aarch64_bitmask_imm (val2, mode))
> -   break;
> - val2 = val2 & ~mask;
> - val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
> - if (val2 != val && aarch64_bitmask_imm (val2, mode))
> -   break;
> -   }
> -  if (i != 64)
> -   {
> - if (generate)
> +  for (i = 0; i < 64; i += 16)
> +   if (aarch64_check_bitmask (val, val2, mask << i))
> + {
> +   if (generate)
> + {
> +   emit_insn (gen_rtx_SET (dest, 

Re: [PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-20 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

> Can you do the aarch64_mov_imm changes as a separate patch?  It's difficult
> to review the two changes folded together like this.

Sure, I'll send a separate patch. So here is version 2 again:

[PATCH v2][AArch64] Improve immediate expansion [PR106583]

Improve immediate expansion of immediates which can be created from a
bitmask immediate and 2 MOVKs.  Simplify, refactor and improve
efficiency of bitmask checks.  This reduces the number of 4-instruction
immediates in SPECINT/FP by 10-15%.

Passes regress, OK for commit?

gcc/ChangeLog:

PR target/106583
* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate)
Add support for a bitmask immediate with 2 MOVKs.
(aarch64_check_bitmask): New function after refactorization.
(aarch64_replicate_bitmask_imm): Remove function, merge into...
(aarch64_bitmask_imm): Simplify replication of small modes.
Split function into 64-bit only version for efficiency.

gcc/testsuite:
PR target/106583
* gcc.target/aarch64/pr106583.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
926e81f028c82aac9a5fecc18f921f84399c24ae..b2d9c7380975028131d0fe731a97b3909874b87b
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -306,6 +306,7 @@ static machine_mode aarch64_simd_container_mode 
(scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT);
 
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = cortexa53;
@@ -5502,6 +5503,30 @@ aarch64_output_sve_vector_inc_dec (const char *operands, 
rtx x)
  factor, nelts_per_vq);
 }
 
+/* Return true if the immediate VAL can be a bitfield immediate
+   by changing the given MASK bits in VAL to zeroes, ones or bits
+   from the other half of VAL.  Return the new immediate in VAL2.  */
+static inline bool
+aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
+  unsigned HOST_WIDE_INT &val2,
+  unsigned HOST_WIDE_INT mask)
+{
+  val2 = val & ~mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+return true;
+  val2 = val | mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+return true;
+  val = val & ~mask;
+  val2 = val | (((val >> 32) | (val << 32)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+return true;
+  val2 = val | (((val >> 16) | (val << 48)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+return true;
+  return false;
+}
+
 static int
 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 scalar_int_mode mode)
@@ -5568,36 +5593,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool 
generate,
   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
 
-  if (zero_match != 2 && one_match != 2)
+  if (zero_match < 2 && one_match < 2)
 {
   /* Try emitting a bitmask immediate with a movk replacing 16 bits.
  For a 64-bit bitmask try whether changing 16 bits to all ones or
  zeroes creates a valid bitmask.  To check any repeated bitmask,
  try using 16 bits from the other 32-bit half of val.  */
 
-  for (i = 0; i < 64; i += 16, mask <<= 16)
-   {
- val2 = val & ~mask;
- if (val2 != val && aarch64_bitmask_imm (val2, mode))
-   break;
- val2 = val | mask;
- if (val2 != val && aarch64_bitmask_imm (val2, mode))
-   break;
- val2 = val2 & ~mask;
- val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
- if (val2 != val && aarch64_bitmask_imm (val2, mode))
-   break;
-   }
-  if (i != 64)
-   {
- if (generate)
+  for (i = 0; i < 64; i += 16)
+   if (aarch64_check_bitmask (val, val2, mask << i))
+ {
+   if (generate)
+ {
+   emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+   emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+  GEN_INT ((val >> i) & 0xffff)));
+ }
+   return 2;
+ }
+}
+
+  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
+  if (zero_match + one_match == 0)
+{
+  for (i = 0; i < 48; i += 16)
+   for (int j = i + 16; j < 64; j += 16)
+ if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
 {
- emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
- emit_insn (gen_insv_immdi (dest, GEN_INT (i),
-GEN_INT ((val >> i) & 0xffff)));
+ 

Re: [PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-20 Thread Richard Sandiford via Gcc-patches
Wilco Dijkstra  writes:
> ping
>
>
>
> Hi Richard,
>
>>>> Sounds good, but could you put it before the mode version,
>>>> to avoid the forward declaration?
>>>
>>> I can swap them around but the forward declaration is still required as
>>> aarch64_check_bitmask is 5000 lines before aarch64_bitmask_imm.
>>
>> OK, how about moving them both above aarch64_check_bitmask?
>
> Sure, I've moved them as well as all related helper functions - it makes the
> diff quite large, but they are all together now, which makes sense. I also
> refactored aarch64_move_imm to handle the case of a 64-bit immediate being
> generated by a 32-bit MOVZ/MOVN - this simplifies
> aarch64_internal_mov_immediate and the movdi patterns even further.

Can you do the aarch64_mov_imm changes as a separate patch?  It's difficult
to review the two changes folded together like this.

Thanks,
Richard

>
> Cheers,
> Wilco
>
> v3: move immediate code together and avoid forward declarations,
> further cleanups and simplifications.
>
> Improve immediate expansion of immediates which can be created from a
> bitmask immediate and 2 MOVKs.  Simplify, refactor and improve
> efficiency of bitmask checks and move immediate. Move various immediate
> handling functions together to avoid forward declarations.
> Include 32-bit MOVZ/N as valid 64-bit immediates. Add new constraint so
> the movdi pattern only needs a single alternative for move immediate.
>
> This reduces the number of 4-instruction immediates in SPECINT/FP by 10-15%.
>
> Passes bootstrap & regress, OK for commit?
>
> gcc/ChangeLog:
>
> PR target/106583
> * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate)
> Add support for a bitmask immediate with 2 MOVKs.
> (aarch64_check_bitmask): New function after refactorization.
> (aarch64_replicate_bitmask_imm): Remove function, merge into...
> (aarch64_bitmask_imm): Simplify replication of small modes.
> Split function into 64-bit only version for efficiency.
> (aarch64_zeroextended_move_imm): New function.
> (aarch64_move_imm): Refactor code.
> (aarch64_uimm12_shift): Move near other immediate functions.
> (aarch64_clamp_to_uimm12_shift): Likewise.
> (aarch64_movk_shift): Likewise.
> (aarch64_replicate_bitmask_imm): Likewise.
> (aarch64_and_split_imm1): Likewise.
> (aarch64_and_split_imm2): Likewise.
> (aarch64_and_bitmask_imm): Likewise.
> (aarch64_movw_imm): Remove.
> * config/aarch64/aarch64.md (movdi_aarch64): Merge 'N' and 'M'
> constraints into single 'O'.
> (mov_aarch64): Likewise.
> * config/aarch64/aarch64-protos.h (aarch64_move_imm): Use unsigned.
> (aarch64_bitmask_imm): Likewise.
> (aarch64_uimm12_shift): Likewise.
> (aarch64_zeroextended_move_imm): New prototype.
> * config/aarch64/constraints.md: Add 'O' for 32/64-bit immediates,
> limit 'N' to 64-bit only moves.
>
> gcc/testsuite:
> PR target/106583
> * gcc.target/aarch64/pr106583.c: Add new test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index 
> 3e4005c9f4ff1f999f1811c6fb0b2252878dc4ae..b82f9ba7c2bb4cffa16abbf45f87061f72015083
>  100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -755,7 +755,7 @@ void aarch64_post_cfi_startproc (void);
>  poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
>  int aarch64_get_condition_code (rtx);
>  bool aarch64_address_valid_for_prefetch_p (rtx, bool);
> -bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
> +bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode);
>  unsigned HOST_WIDE_INT aarch64_and_split_imm1 (HOST_WIDE_INT val_in);
>  unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
>  bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode 
> mode);
> @@ -792,7 +792,7 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, 
> unsigned HOST_WIDE_INT,
>  unsigned HOST_WIDE_INT,
>  unsigned HOST_WIDE_INT);
>  bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
> -bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
> +bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
>  machine_mode aarch64_sve_int_mode (machine_mode);
>  opt_machine_mode aarch64_sve_pred_mode (unsigned int);
>  machine_mode aarch64_sve_pred_mode (machine_mode);
> @@ -842,8 +842,9 @@ bool aarch64_sve_float_arith_immediate_p (rtx, bool);
>  bool aarch64_sve_float_mul_immediate_p (rtx);
>  bool aarch64_split_dimode_const_store (rtx, rtx);
>  bool aarch64_symbolic_address_p (rtx);
> -bool aarch64_uimm12_shift (HOST_WIDE_INT);
> +bool aarch64_uimm12_shift (unsigned HOST_WIDE_INT);
>  int aarch64_movk_shift (const wide_int_ref &, const wide_int_ref &);
> +bool 

Re: [PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-19 Thread Wilco Dijkstra via Gcc-patches
ping



Hi Richard,

>>> Sounds good, but could you put it before the mode version,
>>> to avoid the forward declaration?
>>
>> I can swap them around but the forward declaration is still required as
>> aarch64_check_bitmask is 5000 lines before aarch64_bitmask_imm.
>
> OK, how about moving them both above aarch64_check_bitmask?

Sure, I've moved them as well as all related helper functions - it makes the diff
quite large, but they are all together now, which makes sense. I also refactored
aarch64_move_imm to handle the case of a 64-bit immediate being generated
by a 32-bit MOVZ/MOVN - this simplifies aarch64_internal_mov_immediate
and the movdi patterns even further.

Cheers,
Wilco

v3: move immediate code together and avoid forward declarations,
further cleanups and simplifications.

Improve immediate expansion of immediates which can be created from a
bitmask immediate and 2 MOVKs.  Simplify, refactor and improve 
efficiency of bitmask checks and move immediate. Move various immediate
handling functions together to avoid forward declarations.
Include 32-bit MOVZ/N as valid 64-bit immediates. Add new constraint so
the movdi pattern only needs a single alternative for move immediate.

This reduces the number of 4-instruction immediates in SPECINT/FP by 10-15%.

Passes bootstrap & regress, OK for commit?

gcc/ChangeLog:

    PR target/106583
    * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate)
    Add support for a bitmask immediate with 2 MOVKs.
    (aarch64_check_bitmask): New function after refactorization.
    (aarch64_replicate_bitmask_imm): Remove function, merge into...
    (aarch64_bitmask_imm): Simplify replication of small modes.
    Split function into 64-bit only version for efficiency.
    (aarch64_zeroextended_move_imm): New function.
    (aarch64_move_imm): Refactor code.
    (aarch64_uimm12_shift): Move near other immediate functions.
    (aarch64_clamp_to_uimm12_shift): Likewise.
    (aarch64_movk_shift): Likewise.
    (aarch64_replicate_bitmask_imm): Likewise.
    (aarch64_and_split_imm1): Likewise.
    (aarch64_and_split_imm2): Likewise.
    (aarch64_and_bitmask_imm): Likewise.
    (aarch64_movw_imm): Remove.
    * config/aarch64/aarch64.md (movdi_aarch64): Merge 'N' and 'M'
    constraints into single 'O'.
    (mov_aarch64): Likewise.
    * config/aarch64/aarch64-protos.h (aarch64_move_imm): Use unsigned.
    (aarch64_bitmask_imm): Likewise.
    (aarch64_uimm12_shift): Likewise.
    (aarch64_zeroextended_move_imm): New prototype.
    * config/aarch64/constraints.md: Add 'O' for 32/64-bit immediates,
    limit 'N' to 64-bit only moves.

gcc/testsuite:
    PR target/106583
    * gcc.target/aarch64/pr106583.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
3e4005c9f4ff1f999f1811c6fb0b2252878dc4ae..b82f9ba7c2bb4cffa16abbf45f87061f72015083
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -755,7 +755,7 @@ void aarch64_post_cfi_startproc (void);
 poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
 int aarch64_get_condition_code (rtx);
 bool aarch64_address_valid_for_prefetch_p (rtx, bool);
-bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
+bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode);
 unsigned HOST_WIDE_INT aarch64_and_split_imm1 (HOST_WIDE_INT val_in);
 unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
 bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode 
mode);
@@ -792,7 +792,7 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, 
unsigned HOST_WIDE_INT,
 unsigned HOST_WIDE_INT,
 unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
-bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
+bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
 machine_mode aarch64_sve_int_mode (machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
 machine_mode aarch64_sve_pred_mode (machine_mode);
@@ -842,8 +842,9 @@ bool aarch64_sve_float_arith_immediate_p (rtx, bool);
 bool aarch64_sve_float_mul_immediate_p (rtx);
 bool aarch64_split_dimode_const_store (rtx, rtx);
 bool aarch64_symbolic_address_p (rtx);
-bool aarch64_uimm12_shift (HOST_WIDE_INT);
+bool aarch64_uimm12_shift (unsigned HOST_WIDE_INT);
 int aarch64_movk_shift (const wide_int_ref &, const wide_int_ref &);
+bool aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT);
 bool aarch64_use_return_insn_p (void);
 const char *aarch64_output_casesi (rtx *);
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
4de55beb067ea8f0be0a90060a785c94bdee708b..785ec07692981d423582051ac0897e5dbc3a001f
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ 

Re: [PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-12 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

>>> Sounds good, but could you put it before the mode version,
>>> to avoid the forward declaration?
>>
>> I can swap them around but the forward declaration is still required as
>> aarch64_check_bitmask is 5000 lines before aarch64_bitmask_imm.
>
> OK, how about moving them both above aarch64_check_bitmask?

Sure, I've moved them as well as all related helper functions - it makes the diff
quite large, but they are all together now, which makes sense. I also refactored
aarch64_move_imm to handle the case of a 64-bit immediate being generated
by a 32-bit MOVZ/MOVN - this simplifies aarch64_internal_mov_immediate
and the movdi patterns even further.

Cheers,
Wilco

v3: move immediate code together and avoid forward declarations,
further cleanups and simplifications.

Improve immediate expansion of immediates which can be created from a
bitmask immediate and 2 MOVKs.  Simplify, refactor and improve 
efficiency of bitmask checks and move immediate. Move various immediate
handling functions together to avoid forward declarations.
Include 32-bit MOVZ/N as valid 64-bit immediates. Add new constraint so
the movdi pattern only needs a single alternative for move immediate.

This reduces the number of 4-instruction immediates in SPECINT/FP by 10-15%.

Passes bootstrap & regress, OK for commit?

gcc/ChangeLog:

PR target/106583
* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate)
Add support for a bitmask immediate with 2 MOVKs.
(aarch64_check_bitmask): New function after refactorization.
(aarch64_replicate_bitmask_imm): Remove function, merge into...
(aarch64_bitmask_imm): Simplify replication of small modes.
Split function into 64-bit only version for efficiency.
(aarch64_zeroextended_move_imm): New function.
(aarch64_move_imm): Refactor code.
(aarch64_uimm12_shift): Move near other immediate functions.
(aarch64_clamp_to_uimm12_shift): Likewise.
(aarch64_movk_shift): Likewise.
(aarch64_replicate_bitmask_imm): Likewise.
(aarch64_and_split_imm1): Likewise.
(aarch64_and_split_imm2): Likewise.
(aarch64_and_bitmask_imm): Likewise.
(aarch64_movw_imm): Remove.
* config/aarch64/aarch64.md (movdi_aarch64): Merge 'N' and 'M'
constraints into single 'O'.
(mov_aarch64): Likewise.
* config/aarch64/aarch64-protos.h (aarch64_move_imm): Use unsigned.
(aarch64_bitmask_imm): Likewise.
(aarch64_uimm12_shift): Likewise.
(aarch64_zeroextended_move_imm): New prototype.
* config/aarch64/constraints.md: Add 'O' for 32/64-bit immediates,
limit 'N' to 64-bit only moves.

gcc/testsuite:
PR target/106583
* gcc.target/aarch64/pr106583.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
3e4005c9f4ff1f999f1811c6fb0b2252878dc4ae..b82f9ba7c2bb4cffa16abbf45f87061f72015083
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -755,7 +755,7 @@ void aarch64_post_cfi_startproc (void);
 poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
 int aarch64_get_condition_code (rtx);
 bool aarch64_address_valid_for_prefetch_p (rtx, bool);
-bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
+bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode);
 unsigned HOST_WIDE_INT aarch64_and_split_imm1 (HOST_WIDE_INT val_in);
 unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
 bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode 
mode);
@@ -792,7 +792,7 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, 
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
-bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
+bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
 machine_mode aarch64_sve_int_mode (machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
 machine_mode aarch64_sve_pred_mode (machine_mode);
@@ -842,8 +842,9 @@ bool aarch64_sve_float_arith_immediate_p (rtx, bool);
 bool aarch64_sve_float_mul_immediate_p (rtx);
 bool aarch64_split_dimode_const_store (rtx, rtx);
 bool aarch64_symbolic_address_p (rtx);
-bool aarch64_uimm12_shift (HOST_WIDE_INT);
+bool aarch64_uimm12_shift (unsigned HOST_WIDE_INT);
 int aarch64_movk_shift (const wide_int_ref &, const wide_int_ref &);
+bool aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT);
 bool aarch64_use_return_insn_p (void);
 const char *aarch64_output_casesi (rtx *);
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
4de55beb067ea8f0be0a90060a785c94bdee708b..785ec07692981d423582051ac0897e5dbc3a001f
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ 

Re: [PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-07 Thread Richard Sandiford via Gcc-patches
Wilco Dijkstra  writes:
> Hi Richard,
>
>>> Yes, with a more general search loop we can get that case too -
>>> it doesn't trigger much though. The code that checks for this is
>>> now refactored into a new function. Given there are now many
>>> more calls to aarch64_bitmask_imm, I added a streamlined internal
>>> entry point that assumes the input is 64-bit.
>>
>> Sounds good, but could you put it before the mode version,
>> to avoid the forward declaration?
>
> I can swap them around but the forward declaration is still required as
> aarch64_check_bitmask is 5000 lines before aarch64_bitmask_imm.

OK, how about moving them both above aarch64_check_bitmask?

Richard


Re: [PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-07 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

>> Yes, with a more general search loop we can get that case too -
>> it doesn't trigger much though. The code that checks for this is
>> now refactored into a new function. Given there are now many
>> more calls to aarch64_bitmask_imm, I added a streamlined internal
>> entry point that assumes the input is 64-bit.
>
> Sounds good, but could you put it before the mode version,
> to avoid the forward declaration?

I can swap them around but the forward declaration is still required as
aarch64_check_bitmask is 5000 lines before aarch64_bitmask_imm.

Cheers,
Wilco

Re: [PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-07 Thread Richard Sandiford via Gcc-patches
Wilco Dijkstra via Gcc-patches  writes:
> Hi Richard,
>
>> Did you consider handling the case where the movks aren't for
>> consecutive bitranges?  E.g. the patch handles:
>
>> but it looks like it would be fairly easy to extend it to:
>>
>>  0x12345678
>
> Yes, with a more general search loop we can get that case too -
> it doesn't trigger much though. The code that checks for this is
> now refactored into a new function. Given there are now many
> more calls to aarch64_bitmask_imm, I added a streamlined internal
> entry point that assumes the input is 64-bit.

Sounds good, but could you put it before the mode version,
to avoid the forward declaration?

OK with that change, thanks.

Richard

> Cheers,
> Wilco
>
> [PATCH v2][AArch64] Improve immediate expansion [PR106583]
>
> Improve immediate expansion of immediates which can be created from a
> bitmask immediate and 2 MOVKs.  Simplify, refactor and improve 
> efficiency of bitmask checks.  This reduces the number of 4-instruction
> immediates in SPECINT/FP by 10-15%.
>
> Passes regress, OK for commit?
>
> gcc/ChangeLog:
>
>   PR target/106583
>   * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate)
>   Add support for a bitmask immediate with 2 MOVKs.
> (aarch64_check_bitmask): New function after refactorization.
> (aarch64_replicate_bitmask_imm): Remove function, merge into...
> (aarch64_bitmask_imm): Simplify replication of small modes.
> Split function into 64-bit only version for efficiency. 
>
> gcc/testsuite:
>   PR target/106583
>   * gcc.target/aarch64/pr106583.c: Add new test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> 926e81f028c82aac9a5fecc18f921f84399c24ae..b2d9c7380975028131d0fe731a97b3909874b87b
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -306,6 +306,7 @@ static machine_mode aarch64_simd_container_mode 
> (scalar_mode, poly_int64);
>  static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
>   aarch64_addr_query_type);
>  static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
> +static bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT);
>  
>  /* The processor for which instructions should be scheduled.  */
>  enum aarch64_processor aarch64_tune = cortexa53;
> @@ -5502,6 +5503,30 @@ aarch64_output_sve_vector_inc_dec (const char 
> *operands, rtx x)
>factor, nelts_per_vq);
>  }
>  
> +/* Return true if the immediate VAL can be a bitfield immediate
> +   by changing the given MASK bits in VAL to zeroes, ones or bits
> +   from the other half of VAL.  Return the new immediate in VAL2.  */
> +static inline bool
> +aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
> +unsigned HOST_WIDE_INT &val2,
> +unsigned HOST_WIDE_INT mask)
> +{
> +  val2 = val & ~mask;
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +return true;
> +  val2 = val | mask;
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +return true;
> +  val = val & ~mask;
> +  val2 = val | (((val >> 32) | (val << 32)) & mask);
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +return true;
> +  val2 = val | (((val >> 16) | (val << 48)) & mask);
> +  if (val2 != val && aarch64_bitmask_imm (val2))
> +return true;
> +  return false;
> +}
> +
>  static int
>  aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>   scalar_int_mode mode)
> @@ -5568,36 +5593,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, 
> bool generate,
>one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
>  ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
>  
> -  if (zero_match != 2 && one_match != 2)
> +  if (zero_match < 2 && one_match < 2)
>  {
>/* Try emitting a bitmask immediate with a movk replacing 16 bits.
>For a 64-bit bitmask try whether changing 16 bits to all ones or
>zeroes creates a valid bitmask.  To check any repeated bitmask,
>try using 16 bits from the other 32-bit half of val.  */
>  
> -  for (i = 0; i < 64; i += 16, mask <<= 16)
> - {
> -   val2 = val & ~mask;
> -   if (val2 != val && aarch64_bitmask_imm (val2, mode))
> - break;
> -   val2 = val | mask;
> -   if (val2 != val && aarch64_bitmask_imm (val2, mode))
> - break;
> -   val2 = val2 & ~mask;
> -   val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
> -   if (val2 != val && aarch64_bitmask_imm (val2, mode))
> - break;
> - }
> -  if (i != 64)
> - {
> -   if (generate)
> +  for (i = 0; i < 64; i += 16)
> + if (aarch64_check_bitmask (val, val2, mask << i))
> +   {
> + if (generate)
> +   {
> + emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> + emit_insn 

Re: [PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-06 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

> Did you consider handling the case where the movks aren't for
> consecutive bitranges?  E.g. the patch handles:

> but it looks like it would be fairly easy to extend it to:
>
>  0x12345678

Yes, with a more general search loop we can get that case too -
it doesn't trigger much though. The code that checks for this is
now refactored into a new function. Given there are now many
more calls to aarch64_bitmask_imm, I added a streamlined internal
entry point that assumes the input is 64-bit.

Cheers,
Wilco

[PATCH v2][AArch64] Improve immediate expansion [PR106583]

Improve immediate expansion of immediates which can be created from a
bitmask immediate and 2 MOVKs.  Simplify, refactor and improve 
efficiency of bitmask checks.  This reduces the number of 4-instruction
immediates in SPECINT/FP by 10-15%.

Passes regress, OK for commit?

gcc/ChangeLog:

PR target/106583
* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate)
Add support for a bitmask immediate with 2 MOVKs.
(aarch64_check_bitmask): New function after refactorization.
(aarch64_replicate_bitmask_imm): Remove function, merge into...
(aarch64_bitmask_imm): Simplify replication of small modes.
Split function into 64-bit only version for efficiency. 

gcc/testsuite:
PR target/106583
* gcc.target/aarch64/pr106583.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
926e81f028c82aac9a5fecc18f921f84399c24ae..b2d9c7380975028131d0fe731a97b3909874b87b
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -306,6 +306,7 @@ static machine_mode aarch64_simd_container_mode 
(scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT);
 
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = cortexa53;
@@ -5502,6 +5503,30 @@ aarch64_output_sve_vector_inc_dec (const char *operands, 
rtx x)
 factor, nelts_per_vq);
 }
 
+/* Return true if the immediate VAL can be a bitfield immediate
+   by changing the given MASK bits in VAL to zeroes, ones or bits
+   from the other half of VAL.  Return the new immediate in VAL2.  */
+static inline bool
+aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
+  unsigned HOST_WIDE_INT &val2,
+  unsigned HOST_WIDE_INT mask)
+{
+  val2 = val & ~mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+return true;
+  val2 = val | mask;
+  if (val2 != val && aarch64_bitmask_imm (val2))
+return true;
+  val = val & ~mask;
+  val2 = val | (((val >> 32) | (val << 32)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+return true;
+  val2 = val | (((val >> 16) | (val << 48)) & mask);
+  if (val2 != val && aarch64_bitmask_imm (val2))
+return true;
+  return false;
+}
+
 static int
 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
scalar_int_mode mode)
@@ -5568,36 +5593,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool 
generate,
   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
 
-  if (zero_match != 2 && one_match != 2)
+  if (zero_match < 2 && one_match < 2)
 {
   /* Try emitting a bitmask immediate with a movk replacing 16 bits.
 For a 64-bit bitmask try whether changing 16 bits to all ones or
 zeroes creates a valid bitmask.  To check any repeated bitmask,
 try using 16 bits from the other 32-bit half of val.  */
 
-  for (i = 0; i < 64; i += 16, mask <<= 16)
-   {
- val2 = val & ~mask;
- if (val2 != val && aarch64_bitmask_imm (val2, mode))
-   break;
- val2 = val | mask;
- if (val2 != val && aarch64_bitmask_imm (val2, mode))
-   break;
- val2 = val2 & ~mask;
- val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
- if (val2 != val && aarch64_bitmask_imm (val2, mode))
-   break;
-   }
-  if (i != 64)
-   {
- if (generate)
+  for (i = 0; i < 64; i += 16)
+   if (aarch64_check_bitmask (val, val2, mask << i))
+ {
+   if (generate)
+ {
+   emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+   emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+  GEN_INT ((val >> i) & 0xffff)));
+ }
+   return 2;
+ }
+}
+
+  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
+  if (zero_match + one_match == 0)
+{
+  for (i = 0; i < 48; i += 16)
+   for (int 

Re: [PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-05 Thread Richard Sandiford via Gcc-patches
Wilco Dijkstra  writes:
> Improve immediate expansion of immediates which can be created from a
> bitmask immediate and 2 MOVKs.  This reduces the number of 4-instruction
> immediates in SPECINT/FP by 10-15%.
>
> Passes regress, OK for commit?
>
> gcc/ChangeLog:
>
> PR target/106583
> * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
> Add support for a bitmask immediate with 2 MOVKs.
>
> gcc/testsuite:
> PR target/106583
> * gcc.target/aarch64/pr106583.c: Add new test.

Nice.

Did you consider handling the case where the movks aren't for
consecutive bitranges?  E.g. the patch handles:

  0x12345678

and:

  0x12345678

but it looks like it would be fairly easy to extend it to:

  0x12345678

too.

Also, could you commonise:

  val2 = val & ~mask;
  if (val2 != val && aarch64_bitmask_imm (val2, mode))
break;
  val2 = val | mask;
  if (val2 != val && aarch64_bitmask_imm (val2, mode))
break;
  val2 = val2 & ~mask;
  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
  if (val2 != val && aarch64_bitmask_imm (val2, mode))
break;

?  It's subtle enough that IMO it'd be better not to cut-&-paste it.

Thanks,
Richard

> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> 926e81f028c82aac9a5fecc18f921f84399c24ae..1601d11710cb6132c80a77bb4fe2f8429519aa5a
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -5568,7 +5568,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool 
> generate,
>one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
>  ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
>
> -  if (zero_match != 2 && one_match != 2)
> +  if (zero_match < 2 && one_match < 2)
>  {
>/* Try emitting a bitmask immediate with a movk replacing 16 bits.
>  For a 64-bit bitmask try whether changing 16 bits to all ones or
> @@ -5600,6 +5600,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, 
> bool generate,
> }
>  }
>
> +  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  
> */
> +  if (zero_match + one_match == 0)
> +{
> +  mask = 0xffffffff;
> +
> +  for (i = 0; i < 64; i += 16)
> +   {
> + val2 = val & ~mask;
> + if (aarch64_bitmask_imm (val2, mode))
> +   break;
> + val2 = val | mask;
> + if (aarch64_bitmask_imm (val2, mode))
> +   break;
> + val2 = val2 & ~mask;
> + val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
> + if (aarch64_bitmask_imm (val2, mode))
> +   break;
> +
> + mask = (mask << 16) | (mask >> 48);
> +   }
> +
> +  if (i != 64)
> +   {
> + if (generate)
> +   {
> + emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> + emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> +GEN_INT ((val >> i) & 0xffff)));
> + i = (i + 16) & 63;
> + emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> +GEN_INT ((val >> i) & 0xffff)));
> +   }
> +
> + return 3;
> +   }
> +}
> +
>/* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
>   are emitted by the initial mov.  If one_match > zero_match, skip set 
> bits,
>   otherwise skip zero bits.  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c 
> b/gcc/testsuite/gcc.target/aarch64/pr106583.c
> new file mode 100644
> index 
> ..f0a027a0950e506d4ddaacce5e151f57070948dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
> @@ -0,0 +1,30 @@
> +/* { dg-do assemble } */
> +/* { dg-options "-O2 --save-temps" } */
> +
> +long f1 (void)
> +{
> +  return 0x7efefefefefefeff;
> +}
> +
> +long f2 (void)
> +{
> +  return 0x12345678;
> +}
> +
> +long f3 (void)
> +{
> +  return 0x12345678;
> +}
> +
> +long f4 (void)
> +{
> +  return 0x12345678;
> +}
> +
> +long f5 (void)
> +{
> +  return 0x12345678;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tmovk\t} 10 } } */
> +/* { dg-final { scan-assembler-times {\tmov\t} 5 } } */


[PATCH][AArch64] Improve immediate expansion [PR106583]

2022-10-04 Thread Wilco Dijkstra via Gcc-patches
Improve immediate expansion of immediates which can be created from a
bitmask immediate and 2 MOVKs.  This reduces the number of 4-instruction
immediates in SPECINT/FP by 10-15%.

Passes regress, OK for commit?

gcc/ChangeLog:

PR target/106583
* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
Add support for a bitmask immediate with 2 MOVKs.

gcc/testsuite:
PR target/106583
* gcc.target/aarch64/pr106583.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
926e81f028c82aac9a5fecc18f921f84399c24ae..1601d11710cb6132c80a77bb4fe2f8429519aa5a
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -5568,7 +5568,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool 
generate,
   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
 
-  if (zero_match != 2 && one_match != 2)
+  if (zero_match < 2 && one_match < 2)
 {
   /* Try emitting a bitmask immediate with a movk replacing 16 bits.
 For a 64-bit bitmask try whether changing 16 bits to all ones or
@@ -5600,6 +5600,43 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool 
generate,
}
 }
 
+  /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
+  if (zero_match + one_match == 0)
+{
+  mask = 0xffffffff;
+
+  for (i = 0; i < 64; i += 16)
+   {
+ val2 = val & ~mask;
+ if (aarch64_bitmask_imm (val2, mode))
+   break;
+ val2 = val | mask;
+ if (aarch64_bitmask_imm (val2, mode))
+   break;
+ val2 = val2 & ~mask;
+ val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
+ if (aarch64_bitmask_imm (val2, mode))
+   break;
+
+ mask = (mask << 16) | (mask >> 48);
+   }
+
+  if (i != 64)
+   {
+ if (generate)
+   {
+ emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+ emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+GEN_INT ((val >> i) & 0xffff)));
+ i = (i + 16) & 63;
+ emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+GEN_INT ((val >> i) & 0xffff)));
+   }
+
+ return 3;
+   }
+}
+
   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
  are emitted by the initial mov.  If one_match > zero_match, skip set bits,
  otherwise skip zero bits.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c 
b/gcc/testsuite/gcc.target/aarch64/pr106583.c
new file mode 100644
index 
..f0a027a0950e506d4ddaacce5e151f57070948dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
@@ -0,0 +1,30 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 --save-temps" } */
+
+long f1 (void)
+{
+  return 0x7efefefefefefeff;
+}
+
+long f2 (void)
+{
+  return 0x12345678;
+}
+
+long f3 (void)
+{
+  return 0x12345678;
+}
+
+long f4 (void)
+{
+  return 0x12345678;
+}
+
+long f5 (void)
+{
+  return 0x12345678;
+}
+
+/* { dg-final { scan-assembler-times {\tmovk\t} 10 } } */
+/* { dg-final { scan-assembler-times {\tmov\t} 5 } } */