[PATCH v3] x86: Properly find the maximum stack slot alignment

2023-07-24 Thread H.J. Lu via Gcc-patches
Don't assume that stack slots can only be accessed by stack or frame
registers.  We first find all registers defined by stack or frame
registers.  Then check memory accesses by such registers, including
stack and frame registers.

gcc/

PR target/109780
* config/i386/i386.cc (ix86_update_stack_alignment): New.
(ix86_find_all_reg_use): Likewise.
(ix86_find_max_used_stack_alignment): Also check memory accesses
from registers defined by stack or frame registers.

gcc/testsuite/

PR target/109780
* g++.target/i386/pr109780-1.C: New test.
* gcc.target/i386/pr109780-1.c: Likewise.
* gcc.target/i386/pr109780-2.c: Likewise.
---
 gcc/config/i386/i386.cc| 128 +
 gcc/testsuite/g++.target/i386/pr109780-1.C |  72 
 gcc/testsuite/gcc.target/i386/pr109780-1.c |  14 +++
 gcc/testsuite/gcc.target/i386/pr109780-2.c |  21 
 4 files changed, 214 insertions(+), 21 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr109780-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index caca74d6dec..b71fd9401ef 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -8084,6 +8084,65 @@ output_probe_stack_range (rtx reg, rtx end)
   return "";
 }
 
+/* Update the maximum stack slot alignment from memory alignment in
+   PAT.  */
+
+static void
+ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
+{
+  /* This insn may reference stack slot.  Update the maximum stack slot
+ alignment.  */
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, pat, ALL)
+if (MEM_P (*iter))
+  {
+   unsigned int alignment = MEM_ALIGN (*iter);
+   unsigned int *stack_alignment
+ = (unsigned int *) data;
+   if (alignment > *stack_alignment)
+ *stack_alignment = alignment;
+   break;
+  }
+}
+
+/* Find all registers defined with REG.  */
+
+static void
+ix86_find_all_reg_use (HARD_REG_SET &stack_slot_access,
+  unsigned int reg, auto_bitmap &worklist)
+{
+  for (df_ref ref = DF_REG_USE_CHAIN (reg);
+   ref != NULL;
+   ref = DF_REF_NEXT_REG (ref))
+{
+  if (DF_REF_IS_ARTIFICIAL (ref))
+   continue;
+
+  rtx_insn *insn = DF_REF_INSN (ref);
+  if (!NONDEBUG_INSN_P (insn))
+   continue;
+
+  rtx set = single_set (insn);
+  if (!set)
+   continue;
+
+  rtx src = SET_SRC (set);
+  if (MEM_P (src))
+   continue;
+
+  rtx dest = SET_DEST (set);
+  if (!REG_P (dest))
+   continue;
+
+  if (TEST_HARD_REG_BIT (stack_slot_access, REGNO (dest)))
+   continue;
+
+  /* Add this register to stack_slot_access.  */
+  add_to_hard_reg_set (&stack_slot_access, Pmode, REGNO (dest));
+  bitmap_set_bit (worklist, REGNO (dest));
+}
+}
+
 /* Set stack_frame_required to false if stack frame isn't required.
Update STACK_ALIGNMENT to the largest alignment, in bits, of stack
slot used if stack frame is required and CHECK_STACK_SLOT is true.  */
@@ -8102,10 +8161,6 @@ ix86_find_max_used_stack_alignment (unsigned int 
&stack_alignment,
   add_to_hard_reg_set (&set_up_by_prologue, Pmode,
   HARD_FRAME_POINTER_REGNUM);
 
-  /* The preferred stack alignment is the minimum stack alignment.  */
-  if (stack_alignment > crtl->preferred_stack_boundary)
-stack_alignment = crtl->preferred_stack_boundary;
-
   bool require_stack_frame = false;
 
   FOR_EACH_BB_FN (bb, cfun)
@@ -8117,27 +8172,58 @@ ix86_find_max_used_stack_alignment (unsigned int 
&stack_alignment,
   set_up_by_prologue))
  {
require_stack_frame = true;
-
-   if (check_stack_slot)
- {
-   /* Find the maximum stack alignment.  */
-   subrtx_iterator::array_type array;
-   FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
- if (MEM_P (*iter)
- && (reg_mentioned_p (stack_pointer_rtx,
-  *iter)
- || reg_mentioned_p (frame_pointer_rtx,
- *iter)))
-   {
- unsigned int alignment = MEM_ALIGN (*iter);
- if (alignment > stack_alignment)
-   stack_alignment = alignment;
-   }
- }
+   break;
  }
 }
 
   cfun->machine->stack_frame_required = require_stack_frame;
+
+  /* Stop if we don't need to check stack slot.  */
+  if (!check_stack_slot)
+return;
+
+  /* The preferred stack alignment is the minimum stack alignment.  */
+  if (stack_alignment > crtl->preferred_stack_boundary)
+stack_alignment = crtl->preferred_stack_boundary;
+
+  HARD_REG_SET stack_slot_access;
+  CLEAR_HARD_REG_SET 

Re: [PATCH v2] x86: Properly find the maximum stack slot alignment

2023-07-24 Thread H.J. Lu via Gcc-patches
On Mon, Jul 10, 2023 at 3:32 AM Richard Biener
 wrote:
>
> On Fri, Jul 7, 2023 at 5:14 PM H.J. Lu via Gcc-patches
>  wrote:
> >
> > Don't assume that stack slots can only be accessed by stack or frame
> > registers.  We first find all registers defined by stack or frame
> > registers.  Then check memory accesses by such registers, including
> > stack and frame registers.
> >
> > gcc/
> >
> > PR target/109780
> > * config/i386/i386.cc (ix86_update_stack_alignment): New.
> > (ix86_find_all_reg_use): Likewise.
> > (ix86_find_max_used_stack_alignment): Also check memory accesses
> > from registers defined by stack or frame registers.
> >
> > gcc/testsuite/
> >
> > PR target/109780
> > * g++.target/i386/pr109780-1.C: New test.
> > * gcc.target/i386/pr109780-1.c: Likewise.
> > * gcc.target/i386/pr109780-2.c: Likewise.
> > ---
> >  gcc/config/i386/i386.cc| 120 +
> >  gcc/testsuite/g++.target/i386/pr109780-1.C |  72 +
> >  gcc/testsuite/gcc.target/i386/pr109780-1.c |  14 +++
> >  gcc/testsuite/gcc.target/i386/pr109780-2.c |  21 
> >  4 files changed, 206 insertions(+), 21 deletions(-)
> >  create mode 100644 gcc/testsuite/g++.target/i386/pr109780-1.C
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-2.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index caca74d6dec..27f349b0ccb 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -8084,6 +8084,63 @@ output_probe_stack_range (rtx reg, rtx end)
> >return "";
> >  }
> >
> > +/* Update the maximum stack slot alignment from memory alignment in
> > +   PAT.  */
> > +
> > +static void
> > +ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
> > +{
> > +  /* This insn may reference stack slot.  Update the maximum stack slot
> > + alignment.  */
> > +  subrtx_iterator::array_type array;
> > +  FOR_EACH_SUBRTX (iter, array, pat, ALL)
> > +if (MEM_P (*iter))
> > +  {
> > +   unsigned int alignment = MEM_ALIGN (*iter);
> > +   unsigned int *stack_alignment
> > + = (unsigned int *) data;
> > +   if (alignment > *stack_alignment)
> > + *stack_alignment = alignment;
> > +   break;
> > +  }
> > +}
> > +
> > +/* Find all registers defined with REG.  */
> > +
> > +static void
> > +ix86_find_all_reg_use (HARD_REG_SET &stack_slot_access, int reg)
> > +{
> > +  for (df_ref ref = DF_REG_USE_CHAIN (reg);
> > +   ref != NULL;
> > +   ref = DF_REF_NEXT_REG (ref))
> > +{
> > +  if (DF_REF_IS_ARTIFICIAL (ref))
> > +   continue;
> > +
> > +  rtx_insn *insn = DF_REF_INSN (ref);
> > +  if (!NONDEBUG_INSN_P (insn))
> > +   continue;
> > +
> > +  rtx set = single_set (insn);
> > +  if (!set)
> > +   continue;
> > +
> > +  rtx src = SET_SRC (set);
> > +  if (MEM_P (src))
> > +   continue;
> > +
> > +  rtx dest = SET_DEST (set);
> > +  if (!REG_P (dest))
> > +   continue;
> > +
> > +  if (TEST_HARD_REG_BIT (stack_slot_access, REGNO (dest)))
> > +   continue;
> > +
> > +  /* Add this register to stack_slot_access.  */
> > +  add_to_hard_reg_set (&stack_slot_access, Pmode, REGNO (dest));
> > +}
> > +}
> > +
> >  /* Set stack_frame_required to false if stack frame isn't required.
> > Update STACK_ALIGNMENT to the largest alignment, in bits, of stack
> > slot used if stack frame is required and CHECK_STACK_SLOT is true.  */
> > @@ -8102,10 +8159,6 @@ ix86_find_max_used_stack_alignment (unsigned int 
> > &stack_alignment,
> >add_to_hard_reg_set (&set_up_by_prologue, Pmode,
> >HARD_FRAME_POINTER_REGNUM);
> >
> > -  /* The preferred stack alignment is the minimum stack alignment.  */
> > -  if (stack_alignment > crtl->preferred_stack_boundary)
> > -stack_alignment = crtl->preferred_stack_boundary;
> > -
> >bool require_stack_frame = false;
> >
> >FOR_EACH_BB_FN (bb, cfun)
> > @@ -8117,27 +8170,52 @@ ix86_find_max_used_stack_alignment (unsigned int 
> > &stack_alignment,
> >set_up_by_prologue))
> >   {
> >

[PATCH v2] x86: Properly find the maximum stack slot alignment

2023-07-07 Thread H.J. Lu via Gcc-patches
Don't assume that stack slots can only be accessed by stack or frame
registers.  We first find all registers defined by stack or frame
registers.  Then check memory accesses by such registers, including
stack and frame registers.

gcc/

PR target/109780
* config/i386/i386.cc (ix86_update_stack_alignment): New.
(ix86_find_all_reg_use): Likewise.
(ix86_find_max_used_stack_alignment): Also check memory accesses
from registers defined by stack or frame registers.

gcc/testsuite/

PR target/109780
* g++.target/i386/pr109780-1.C: New test.
* gcc.target/i386/pr109780-1.c: Likewise.
* gcc.target/i386/pr109780-2.c: Likewise.
---
 gcc/config/i386/i386.cc| 120 +
 gcc/testsuite/g++.target/i386/pr109780-1.C |  72 +
 gcc/testsuite/gcc.target/i386/pr109780-1.c |  14 +++
 gcc/testsuite/gcc.target/i386/pr109780-2.c |  21 
 4 files changed, 206 insertions(+), 21 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr109780-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index caca74d6dec..27f349b0ccb 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -8084,6 +8084,63 @@ output_probe_stack_range (rtx reg, rtx end)
   return "";
 }
 
+/* Update the maximum stack slot alignment from memory alignment in
+   PAT.  */
+
+static void
+ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
+{
+  /* This insn may reference stack slot.  Update the maximum stack slot
+ alignment.  */
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, pat, ALL)
+if (MEM_P (*iter))
+  {
+   unsigned int alignment = MEM_ALIGN (*iter);
+   unsigned int *stack_alignment
+ = (unsigned int *) data;
+   if (alignment > *stack_alignment)
+ *stack_alignment = alignment;
+   break;
+  }
+}
+
+/* Find all registers defined with REG.  */
+
+static void
+ix86_find_all_reg_use (HARD_REG_SET &stack_slot_access, int reg)
+{
+  for (df_ref ref = DF_REG_USE_CHAIN (reg);
+   ref != NULL;
+   ref = DF_REF_NEXT_REG (ref))
+{
+  if (DF_REF_IS_ARTIFICIAL (ref))
+   continue;
+
+  rtx_insn *insn = DF_REF_INSN (ref);
+  if (!NONDEBUG_INSN_P (insn))
+   continue;
+
+  rtx set = single_set (insn);
+  if (!set)
+   continue;
+
+  rtx src = SET_SRC (set);
+  if (MEM_P (src))
+   continue;
+
+  rtx dest = SET_DEST (set);
+  if (!REG_P (dest))
+   continue;
+
+  if (TEST_HARD_REG_BIT (stack_slot_access, REGNO (dest)))
+   continue;
+
+  /* Add this register to stack_slot_access.  */
+  add_to_hard_reg_set (&stack_slot_access, Pmode, REGNO (dest));
+}
+}
+
 /* Set stack_frame_required to false if stack frame isn't required.
Update STACK_ALIGNMENT to the largest alignment, in bits, of stack
slot used if stack frame is required and CHECK_STACK_SLOT is true.  */
@@ -8102,10 +8159,6 @@ ix86_find_max_used_stack_alignment (unsigned int 
&stack_alignment,
   add_to_hard_reg_set (&set_up_by_prologue, Pmode,
   HARD_FRAME_POINTER_REGNUM);
 
-  /* The preferred stack alignment is the minimum stack alignment.  */
-  if (stack_alignment > crtl->preferred_stack_boundary)
-stack_alignment = crtl->preferred_stack_boundary;
-
   bool require_stack_frame = false;
 
   FOR_EACH_BB_FN (bb, cfun)
@@ -8117,27 +8170,52 @@ ix86_find_max_used_stack_alignment (unsigned int 
&stack_alignment,
   set_up_by_prologue))
  {
require_stack_frame = true;
-
-   if (check_stack_slot)
- {
-   /* Find the maximum stack alignment.  */
-   subrtx_iterator::array_type array;
-   FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
- if (MEM_P (*iter)
- && (reg_mentioned_p (stack_pointer_rtx,
-  *iter)
- || reg_mentioned_p (frame_pointer_rtx,
- *iter)))
-   {
- unsigned int alignment = MEM_ALIGN (*iter);
- if (alignment > stack_alignment)
-   stack_alignment = alignment;
-   }
- }
+   break;
  }
 }
 
   cfun->machine->stack_frame_required = require_stack_frame;
+
+  /* Stop if we don't need to check stack slot.  */
+  if (!check_stack_slot)
+return;
+
+  /* The preferred stack alignment is the minimum stack alignment.  */
+  if (stack_alignment > crtl->preferred_stack_boundary)
+stack_alignment = crtl->preferred_stack_boundary;
+
+  HARD_REG_SET stack_slot_access;
+  CLEAR_HARD_REG_SET (stack_slot_access);
+
+  /* Stack slot can be accessed by stack pointer, frame pointer or
+ 

[PATCH] x86: Properly find the maximum stack slot alignment

2023-07-05 Thread H.J. Lu via Gcc-patches
Don't assume that stack slots can only be accessed by stack or frame
registers.  Also check memory accesses from registers defined by
stack or frame registers.

gcc/

PR target/109780
* config/i386/i386.cc (ix86_set_with_register_source): New.
(ix86_find_all_stack_access): Likewise.
(ix86_find_max_used_stack_alignment): Also check memory accesses
from registers defined by stack or frame registers.

gcc/testsuite/

PR target/109780
* g++.target/i386/pr109780-1.C: New test.
* gcc.target/i386/pr109780-1.c: Likewise.
* gcc.target/i386/pr109780-2.c: Likewise.
---
 gcc/config/i386/i386.cc| 145 ++---
 gcc/testsuite/g++.target/i386/pr109780-1.C |  72 ++
 gcc/testsuite/gcc.target/i386/pr109780-1.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr109780-2.c |  21 +++
 4 files changed, 233 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr109780-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index caca74d6dec..85dd8cb0581 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -8084,6 +8084,72 @@ output_probe_stack_range (rtx reg, rtx end)
   return "";
 }
 
+/* Check if PAT is a SET with register source.  */
+
+static void
+ix86_set_with_register_source (rtx, const_rtx pat, void *data)
+{
+  if (GET_CODE (pat) != SET)
+return;
+
+  rtx src = SET_SRC (pat);
+  if (MEM_P (src) || CONST_INT_P (src))
+return;
+
+  bool *may_use_register = (bool *) data;
+  *may_use_register = true;
+}
+
+/* Find all register access registers.  */
+
+static bool
+ix86_find_all_stack_access (HARD_REG_SET &stack_slot_access)
+{
+  bool repeat = false;
+
+  for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+if (GENERAL_REGNO_P (i)
+   && !TEST_HARD_REG_BIT (stack_slot_access, i))
+  for (df_ref def = DF_REG_DEF_CHAIN (i);
+  def != NULL;
+  def = DF_REF_NEXT_REG (def))
+   {
+ if (DF_REF_IS_ARTIFICIAL (def))
+   continue;
+
+ rtx_insn *insn = DF_REF_INSN (def);
+
+ bool may_use_register = false;
+ note_stores (insn, ix86_set_with_register_source,
+  &may_use_register);
+
+ if (!may_use_register)
+   continue;
+
+ df_ref use;
+ FOR_EACH_INSN_USE (use, insn)
+   {
+ rtx reg = DF_REF_REG (use);
+
+ if (!REG_P (reg))
+   continue;
+
+ /* Skip if stack slot access register isn't used.  */
+ if (!TEST_HARD_REG_BIT (stack_slot_access,
+ REGNO (reg)))
+   continue;
+
+ /* Add this register to stack_slot_access.  */
+ add_to_hard_reg_set (&stack_slot_access, Pmode, i);
+
+ /* Repeat if a register is added to stack_slot_access.  */
+ repeat = true;
+   }
+   }
+
+  return repeat;
+}
+
 /* Set stack_frame_required to false if stack frame isn't required.
Update STACK_ALIGNMENT to the largest alignment, in bits, of stack
slot used if stack frame is required and CHECK_STACK_SLOT is true.  */
@@ -8092,15 +8158,23 @@ static void
 ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
bool check_stack_slot)
 {
-  HARD_REG_SET set_up_by_prologue, prologue_used;
+  HARD_REG_SET set_up_by_prologue, prologue_used, stack_slot_access;
   basic_block bb;
 
   CLEAR_HARD_REG_SET (prologue_used);
   CLEAR_HARD_REG_SET (set_up_by_prologue);
+  CLEAR_HARD_REG_SET (stack_slot_access);
   add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
   add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
   add_to_hard_reg_set (&set_up_by_prologue, Pmode,
   HARD_FRAME_POINTER_REGNUM);
+  /* Stack slot can be accessed by stack pointer, frame pointer or
+ registers defined by stack pointer or frame pointer.  */
+  add_to_hard_reg_set (&stack_slot_access, Pmode,
+  STACK_POINTER_REGNUM);
+  if (frame_pointer_needed)
+add_to_hard_reg_set (&stack_slot_access, Pmode,
+HARD_FRAME_POINTER_REGNUM);
 
   /* The preferred stack alignment is the minimum stack alignment.  */
   if (stack_alignment > crtl->preferred_stack_boundary)
@@ -8108,32 +8182,65 @@ ix86_find_max_used_stack_alignment (unsigned int 
&stack_alignment,
 
   bool require_stack_frame = false;
 
+  /* Find all register access registers.  */
+  while (ix86_find_all_stack_access (stack_slot_access))
+;
+
   FOR_EACH_BB_FN (bb, cfun)
 {
   rtx_insn *insn;
   FOR_BB_INSNS (bb, insn)
-   if (NONDEBUG_INSN_P (insn)
-   && requires_stack_frame_p (insn, prologue_used,
-  set_up_by_prologue))
+   if (NONDEBUG_INSN_P (insn))
  {
-   

Re: [PATCH] i386: Honour -mdirect-extern-access when calling __fentry__

2023-05-10 Thread H.J. Lu via Gcc-patches
On Wed, May 10, 2023 at 2:17 AM Uros Bizjak  wrote:
>
> On Tue, May 9, 2023 at 10:58 AM Ard Biesheuvel  wrote:
> >
> > The small and medium PIC code models generate profiling calls that
> > always load the address of __fentry__() via the GOT, even if
> > -mdirect-extern-access is in effect.
> >
> > This deviates from the behavior with respect to other external
> > references, and results in a longer opcode that relies on linker
> > relaxation to eliminate the GOT load. In this particular case, the
> > transformation replaces an indirect 'CALL *__fentry__@GOTPCREL(%rip)'
> > with either 'CALL __fentry__; NOP' or 'NOP; CALL __fentry__', where the
> > NOP is a 1 byte NOP that preserves the 6 byte length of the sequence.
> >
> > This is problematic for the Linux kernel, which generally relies on
> > -mdirect-extern-access and hidden visibility to eliminate GOT based
> > symbol references in code generated with -fpie/-fpic, without having to
> > depend on linker relaxation.
> >
> > The Linux kernel relies on code patching to replace these opcodes with
> > NOPs at runtime, and this is complicated code that we'd prefer not to
> > complicate even more by adding support for patching both 5 and 6 byte
> > sequences as well as parsing the instruction stream to decide which
> > variant of CALL+NOP we are dealing with.
> >
> > So let's honour -mdirect-extern-access, and only load the address of
> > __fentry__ via the GOT if direct references to external symbols are not
> > permitted.
> >
> > Note that the GOT reference in question is in fact a data reference: we
> > explicitly load the address of __fentry__ from the GOT, which amounts to
> > eager binding, rather than emitting a PLT call that could bind eagerly,
> > lazily or directly at link time.
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386.cc (x86_function_profiler): Take
> >   ix86_direct_extern_access into account when generating calls
> >   to __fentry__()
>
> HJ, is the patch OK with you?

LGTM.

Thanks.

> Uros.
>
> >
> > Cc: H.J. Lu 
> > Cc: Jakub Jelinek 
> > Cc: Richard Biener 
> > Cc: Uros Bizjak 
> > Cc: Hou Wenlong 
> > ---
> >  gcc/config/i386/i386.cc | 8 ++--
> >  1 file changed, 6 insertions(+), 2 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index b1d08ecdb3d44729..69b183abb4318b0a 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -21836,8 +21836,12 @@ x86_function_profiler (FILE *file, int labelno 
> > ATTRIBUTE_UNUSED)
> >   break;
> > case CM_SMALL_PIC:
> > case CM_MEDIUM_PIC:
> > - fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", 
> > mcount_name);
> > - break;
> > + if (!ix86_direct_extern_access)
> > +   {
> > + fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", 
> > mcount_name);
> > + break;
> > +   }
> > + /* fall through */
> > default:
> >   x86_print_call_or_nop (file, mcount_name);
> >   break;
> > --
> > 2.39.2
> >



-- 
H.J.


Re: [PATCH] libsanitizer: cherry-pick commit 05551c658269 from upstream

2023-04-27 Thread H.J. Lu via Gcc-patches
On Thu, Apr 27, 2023 at 12:03 AM Martin Liška  wrote:
>
> On 4/27/23 04:32, H.J. Lu via Gcc-patches wrote:
> > cherry-pick:
>
> Can you please wait a few days before it? I'm going to merge again
> in the near future after https://reviews.llvm.org/D144073 got handled.

Sure.

> Martin
>
> >
> > 05551c658269 [sanitizer] Correct alignment of x32 __sanitizer_siginfo
> >
> >   * sanitizer_common/sanitizer_platform_limits_posix.h
> >   (__sanitizer_siginfo_pad): Use u64 to align x32
> >   __sanitizer_siginfo to 8 bytes.
> > ---
> >  .../sanitizer_common/sanitizer_platform_limits_posix.h   | 5 +
> >  1 file changed, 5 insertions(+)
> >
> > diff --git 
> > a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h 
> > b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
> > index cfca7bdedbe..e6f298c26e1 100644
> > --- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
> > +++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
> > @@ -578,8 +578,13 @@ struct __sanitizer_sigset_t {
> >  #endif
> >
> >  struct __sanitizer_siginfo_pad {
> > +#if SANITIZER_X32
> > +  // x32 siginfo_t is aligned to 8 bytes.
> > +  u64 pad[128 / sizeof(u64)];
> > +#else
> >// Require uptr, because siginfo_t is always pointer-size aligned on 
> > Linux.
> >uptr pad[128 / sizeof(uptr)];
> > +#endif
> >  };
> >
> >  #if SANITIZER_LINUX
>


-- 
H.J.


[PATCH] libsanitizer: cherry-pick commit 05551c658269 from upstream

2023-04-26 Thread H.J. Lu via Gcc-patches
cherry-pick:

05551c658269 [sanitizer] Correct alignment of x32 __sanitizer_siginfo

* sanitizer_common/sanitizer_platform_limits_posix.h
(__sanitizer_siginfo_pad): Use u64 to align x32
__sanitizer_siginfo to 8 bytes.
---
 .../sanitizer_common/sanitizer_platform_limits_posix.h   | 5 +
 1 file changed, 5 insertions(+)

diff --git a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h 
b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
index cfca7bdedbe..e6f298c26e1 100644
--- a/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/libsanitizer/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -578,8 +578,13 @@ struct __sanitizer_sigset_t {
 #endif
 
 struct __sanitizer_siginfo_pad {
+#if SANITIZER_X32
+  // x32 siginfo_t is aligned to 8 bytes.
+  u64 pad[128 / sizeof(u64)];
+#else
   // Require uptr, because siginfo_t is always pointer-size aligned on Linux.
   uptr pad[128 / sizeof(uptr)];
+#endif
 };
 
 #if SANITIZER_LINUX
-- 
2.40.0



Re: libsanitizer: sync from master

2023-04-26 Thread H.J. Lu via Gcc-patches
On Wed, Apr 26, 2023 at 4:37 PM H.J. Lu  wrote:
>
> On Wed, Apr 26, 2023 at 1:24 PM Martin Liška  wrote:
> >
> > On 4/26/23 21:23, H.J. Lu wrote:
> > > On Wed, Apr 26, 2023 at 6:52 AM Martin Liška  wrote:
> > >>
> > >> On 11/15/22 16:47, Martin Liška wrote:
> > >>> Hi.
> > >>>
> > >>> I've just pushed libsanitizer update that was tested on x86_64-linux 
> > >>> and ppc64le-linux systems.
> > >>> Moreover, I run bootstrap on x86_64-linux and checked ABI difference 
> > >>> with abidiff.
> > >>
> > >> Hello.
> > >>
> > >> And I've done the same now and merged upstream version 
> > >> 3185e47b5a8444e9fd.
> > >
> > > It caused the bootstrap failure:
> > >
> > > https://gcc.gnu.org/pipermail/gcc-regression/2023-April/077674.html
> >
> > Can you see what's the build error in the build log? I can't see it from the
> > sent link?
>
> I opened:
>
> https://github.com/llvm/llvm-project/issues/62394
>
> and will submit a patch upstream.
>

Fixed in upstream by

https://reviews.llvm.org/D142995

-- 
H.J.


Re: libsanitizer: sync from master

2023-04-26 Thread H.J. Lu via Gcc-patches
On Wed, Apr 26, 2023 at 1:24 PM Martin Liška  wrote:
>
> On 4/26/23 21:23, H.J. Lu wrote:
> > On Wed, Apr 26, 2023 at 6:52 AM Martin Liška  wrote:
> >>
> >> On 11/15/22 16:47, Martin Liška wrote:
> >>> Hi.
> >>>
> >>> I've just pushed libsanitizer update that was tested on x86_64-linux and 
> >>> ppc64le-linux systems.
> >>> Moreover, I run bootstrap on x86_64-linux and checked ABI difference with 
> >>> abidiff.
> >>
> >> Hello.
> >>
> >> And I've done the same now and merged upstream version 3185e47b5a8444e9fd.
> >
> > It caused the bootstrap failure:
> >
> > https://gcc.gnu.org/pipermail/gcc-regression/2023-April/077674.html
>
> Can you see what's the build error in the build log? I can't see it from the
> sent link?

I opened:

https://github.com/llvm/llvm-project/issues/62394

and will submit a patch upstream.

> Martin
>
> >
> >> Martin
> >>
> >>>
> >>> Pushed as r13-4068-g3037f11fb86eda.
> >>>
> >>> Cheers,
> >>> Martin
> >>
> >
> >
>


-- 
H.J.


Re: libsanitizer: sync from master

2023-04-26 Thread H.J. Lu via Gcc-patches
On Wed, Apr 26, 2023 at 6:52 AM Martin Liška  wrote:
>
> On 11/15/22 16:47, Martin Liška wrote:
> > Hi.
> >
> > I've just pushed libsanitizer update that was tested on x86_64-linux and 
> > ppc64le-linux systems.
> > Moreover, I run bootstrap on x86_64-linux and checked ABI difference with 
> > abidiff.
>
> Hello.
>
> And I've done the same now and merged upstream version 3185e47b5a8444e9fd.

It caused the bootstrap failure:

https://gcc.gnu.org/pipermail/gcc-regression/2023-April/077674.html

> Martin
>
> >
> > Pushed as r13-4068-g3037f11fb86eda.
> >
> > Cheers,
> > Martin
>


-- 
H.J.


Re: [PATCH] Remove TARGET_GEN_MEMSET_SCRATCH_RTX since it's not used anymore.

2023-03-22 Thread H.J. Lu via Gcc-patches
On Wed, Mar 22, 2023 at 3:19 AM Richard Biener
 wrote:
>
> On Wed, Mar 22, 2023 at 8:07 AM Uros Bizjak  wrote:
> >
> > On Wed, Mar 22, 2023 at 3:59 AM liuhongt  wrote:
> > >
> > > The target hook is only used by i386, and the current definition is
> > > same as default gen_reg_rtx. So there's no need for this target hook.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk(or GCC14)?
> > >
> > > gcc/ChangeLog:
> > >
> > > * builtins.cc (builtin_memset_read_str): Replace
> > > targetm.gen_memset_scratch_rtx with gen_reg_rtx.
> > > (builtin_memset_gen_str): Ditto.
> > > * config/i386/i386-expand.cc
> > > (ix86_convert_const_wide_int_to_broadcast): Replace
> > > ix86_gen_scratch_sse_rtx with gen_reg_rtx.
> > > (ix86_expand_vector_move): Ditto.
> > > * config/i386/i386-protos.h (ix86_gen_scratch_sse_rtx):
> > > Removed.
> > > * config/i386/i386.cc (ix86_gen_scratch_sse_rtx): Removed.
> > > (TARGET_GEN_MEMSET_SCRATCH_RTX): Removed.
> > > * doc/tm.texi: Remove TARGET_GEN_MEMSET_SCRATCH_RTX.
> > > * doc/tm.texi.in: Ditto.
> > > * target.def: Ditto.
> >
> > Looks trivial enough for gcc13, so OK for x86 part.
> >
> > Needs also OK from a middle-end reviewer.
>
> Is/was the code ever exercised for non-x86?  HJ, what was the reason to
> abstract this?
>
> OK if HJ thinks it was really unnecessary abstraction unlikely to be
> required by another target.

OK with me.

Thanks.

> Richard.
>
> > Thanks,
> > Uros.
> >
> > > ---
> > >  gcc/builtins.cc|  4 ++--
> > >  gcc/config/i386/i386-expand.cc |  6 +++---
> > >  gcc/config/i386/i386-protos.h  |  2 --
> > >  gcc/config/i386/i386.cc| 12 
> > >  gcc/doc/tm.texi|  7 ---
> > >  gcc/doc/tm.texi.in |  2 --
> > >  gcc/target.def |  9 -
> > >  7 files changed, 5 insertions(+), 37 deletions(-)
> > >
> > > diff --git a/gcc/builtins.cc b/gcc/builtins.cc
> > > index 90246e214d6..8026e2001b7 100644
> > > --- a/gcc/builtins.cc
> > > +++ b/gcc/builtins.cc
> > > @@ -4212,7 +4212,7 @@ builtin_memset_read_str (void *data, void *prev,
> > > return const_vec;
> > >
> > >/* Use the move expander with CONST_VECTOR.  */
> > > -  target = targetm.gen_memset_scratch_rtx (mode);
> > > +  target = gen_reg_rtx (mode);
> > >emit_move_insn (target, const_vec);
> > >return target;
> > >  }
> > > @@ -4256,7 +4256,7 @@ builtin_memset_gen_str (void *data, void *prev,
> > >  the memset expander.  */
> > >insn_code icode = optab_handler (vec_duplicate_optab, mode);
> > >
> > > -  target = targetm.gen_memset_scratch_rtx (mode);
> > > +  target = gen_reg_rtx (mode);
> > >class expand_operand ops[2];
> > > >create_output_operand (&ops[0], target, mode);
> > > >create_input_operand (&ops[1], (rtx) data, QImode);
> > > diff --git a/gcc/config/i386/i386-expand.cc 
> > > b/gcc/config/i386/i386-expand.cc
> > > index c1300dc4e26..1e3ce4b7c3f 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -338,7 +338,7 @@ ix86_convert_const_wide_int_to_broadcast 
> > > (machine_mode mode, rtx op)
> > >machine_mode vector_mode;
> > > >if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
> > >  gcc_unreachable ();
> > > -  rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
> > > +  rtx target = gen_reg_rtx (vector_mode);
> > >bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
> > >target,
> > >GEN_INT (val_broadcast));
> > > @@ -686,7 +686,7 @@ ix86_expand_vector_move (machine_mode mode, rtx 
> > > operands[])
> > >if (!register_operand (op0, mode)
> > >   && !register_operand (op1, mode))
> > > {
> > > - rtx scratch = ix86_gen_scratch_sse_rtx (mode);
> > > + rtx scratch = gen_reg_rtx (mode);
> > >   emit_move_insn (scratch, op1);
> > >   op1 = scratch;
> > > }
> > > @@ -728,7 +728,7 @@ ix86_expand_vector_move (machine_mode mode, rtx 
> > > operands[])
> > >&& !register_operand (op0, mode)
> > >&& !register_operand (op1, mode))
> > >  {
> > > -  rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
> > > +  rtx tmp = gen_reg_rtx (GET_MODE (op0));
> > >emit_move_insn (tmp, op1);
> > >emit_move_insn (op0, tmp);
> > >return;
> > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> > > index bfb2198265a..71ae95ffef7 100644
> > > --- a/gcc/config/i386/i386-protos.h
> > > +++ b/gcc/config/i386/i386-protos.h
> > > @@ -50,8 +50,6 @@ extern void ix86_reset_previous_fndecl (void);
> > >
> > >  extern bool ix86_using_red_zone (void);
> > >
> > > -extern rtx ix86_gen_scratch_sse_rtx (machine_mode);
> > > 

Re: [PATCH] i386: Call get_available_features for all CPUs with max_level >= 1 [PR100758]

2023-02-09 Thread H.J. Lu via Gcc-patches
On Thu, Feb 9, 2023 at 4:12 AM Jakub Jelinek  wrote:
>
> Hi!
>
> get_available_features doesn't depend on cpu_model2->__cpu_{family,model}
> and just sets stuff up based on CPUID leaf 1, or some extended ones,
> so I wonder why are we calling it separately for Intel, AMD and Zhaoxin
> and not for all other CPUs too?  I think various programs in the wild
> which aren't using __builtin_cpu_{is,supports} just check the various CPUID
> leafs and query bits in there, without blacklisting unknown CPU vendors,
> so I think even __builtin_cpu_supports ("sse2") etc. should be reliable
> if those VENDOR_{CENTAUR,CYRIX,NSC,OTHER} CPUs set those bits in CPUID leaf
> 1 or some extended ones.  Calling it for all CPUs also means it can be
> inlined because there will be just a single caller.
>
> I will test on Intel but can't test it on non-Intel (or with some extra
> effort on AMD; for both of those arches it should be really no change in
> behavior).
>
> Thoughts on this?

No objection here.   It just isn't easy to verify CPUID behavior on
other processors.

Thanks.

> 2023-02-09  Jakub Jelinek  
>
> PR target/100758
> * common/config/i386/cpuinfo.h (get_zhaoxin_cpu): Formatting fixes.
> (cpu_indicator_init): Call get_available_features for all CPUs with
> max_level >= 1, rather than just Intel, AMD or Zhaoxin.  Formatting
> fixes.
>
> --- gcc/common/config/i386/cpuinfo.h.jj 2023-01-16 11:52:15.910736614 +0100
> +++ gcc/common/config/i386/cpuinfo.h2023-02-09 12:51:23.539470140 +0100
> @@ -601,8 +601,8 @@ get_intel_cpu (struct __processor_model
>
>  static inline const char *
>  get_zhaoxin_cpu (struct __processor_model *cpu_model,
> -   struct __processor_model2 *cpu_model2,
> -   unsigned int *cpu_features2)
> +struct __processor_model2 *cpu_model2,
> +unsigned int *cpu_features2)
>  {
>const char *cpu = NULL;
>unsigned int family = cpu_model2->__cpu_family;
> @@ -1016,6 +1016,10 @@ cpu_indicator_init (struct __processor_m
>extended_model = (eax >> 12) & 0xf0;
>extended_family = (eax >> 20) & 0xff;
>
> +  /* Find available features. */
> +  get_available_features (cpu_model, cpu_model2, cpu_features2,
> + ecx, edx);
> +
>if (vendor == signature_INTEL_ebx)
>  {
>/* Adjust model and family for Intel CPUS. */
> @@ -1030,9 +1034,6 @@ cpu_indicator_init (struct __processor_m
>cpu_model2->__cpu_family = family;
>cpu_model2->__cpu_model = model;
>
> -  /* Find available features. */
> -  get_available_features (cpu_model, cpu_model2, cpu_features2,
> - ecx, edx);
>/* Get CPU type.  */
>get_intel_cpu (cpu_model, cpu_model2, cpu_features2);
>cpu_model->__cpu_vendor = VENDOR_INTEL;
> @@ -1049,9 +1050,6 @@ cpu_indicator_init (struct __processor_m
>cpu_model2->__cpu_family = family;
>cpu_model2->__cpu_model = model;
>
> -  /* Find available features. */
> -  get_available_features (cpu_model, cpu_model2, cpu_features2,
> - ecx, edx);
>/* Get CPU type.  */
>get_amd_cpu (cpu_model, cpu_model2, cpu_features2);
>cpu_model->__cpu_vendor = VENDOR_AMD;
> @@ -1059,22 +1057,17 @@ cpu_indicator_init (struct __processor_m
>else if (vendor == signature_CENTAUR_ebx && family < 0x07)
>  cpu_model->__cpu_vendor = VENDOR_CENTAUR;
>else if (vendor == signature_SHANGHAI_ebx
> -   || vendor == signature_CENTAUR_ebx)
> +  || vendor == signature_CENTAUR_ebx)
>  {
>/* Adjust model and family for ZHAOXIN CPUS.  */
>if (family == 0x07)
> -   {
> - model += extended_model;
> -   }
> +   model += extended_model;
>
>cpu_model2->__cpu_family = family;
>cpu_model2->__cpu_model = model;
>
> -  /* Find available features.  */
> -  get_available_features (cpu_model, cpu_model2, cpu_features2,
> - ecx, edx);
>/* Get CPU type.  */
> -  get_zhaoxin_cpu (cpu_model, cpu_model2,cpu_features2);
> +  get_zhaoxin_cpu (cpu_model, cpu_model2, cpu_features2);
>cpu_model->__cpu_vendor = VENDOR_ZHAOXIN;
>  }
>else if (vendor == signature_CYRIX_ebx)
>
> Jakub
>


-- 
H.J.


[PATCH] libsanitizer: cherry-pick commit 742bcbf685bc from upstream

2023-01-31 Thread H.J. Lu via Gcc-patches
cherry-pick:

742bcbf685bc compiler-rt/lib: Add .Linterceptor_sigsetjmp

PR sanitizer/108106
* hwasan/hwasan_setjmp_x86_64.S (__interceptor_setjmp): Jump
to .Linterceptor_sigsetjmp instead of __interceptor_sigsetjmp.
(__interceptor_sigsetjmp): Add a local alias,
.Linterceptor_sigsetjmp.
---
 libsanitizer/hwasan/hwasan_setjmp_x86_64.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libsanitizer/hwasan/hwasan_setjmp_x86_64.S 
b/libsanitizer/hwasan/hwasan_setjmp_x86_64.S
index 7566c1ea0a5..a5a3858d94d 100644
--- a/libsanitizer/hwasan/hwasan_setjmp_x86_64.S
+++ b/libsanitizer/hwasan/hwasan_setjmp_x86_64.S
@@ -37,13 +37,14 @@ __interceptor_setjmp:
   CFI_STARTPROC
   _CET_ENDBR
   xorl %esi, %esi
-  jmp  __interceptor_sigsetjmp
+  jmp  .Linterceptor_sigsetjmp
   CFI_ENDPROC
 ASM_SIZE(__interceptor_setjmp)
 
 .global __interceptor_sigsetjmp
 ASM_TYPE_FUNCTION(__interceptor_sigsetjmp)
 __interceptor_sigsetjmp:
+.Linterceptor_sigsetjmp:
   CFI_STARTPROC
   _CET_ENDBR
 
-- 
2.39.1



[PATCH] x86: Check invalid third argument to __builtin_ia32_prefetch

2023-01-18 Thread H.J. Lu via Gcc-patches
Check invalid third argument to __builtin_ia32_prefetch when expanding
__builtin_ia32_prefetch to avoid an ICE later.

gcc/

PR target/108436
* config/i386/i386-expand.cc (ix86_expand_builtin): Check
invalid third argument to __builtin_ia32_prefetch.

gcc/testsuite/

* gcc.target/i386/pr108436.c: New test.
---
 gcc/config/i386/i386-expand.cc   | 12 
 gcc/testsuite/gcc.target/i386/pr108436.c | 15 +++
 2 files changed, 27 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108436.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 54f700cd09d..e2e2d28bb47 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -13175,6 +13175,12 @@ ix86_expand_builtin (tree exp, rtx target, rtx 
subtarget,
 
if (INTVAL (op3) == 1)
  {
+   if (INTVAL (op2) < 2 || INTVAL (op2) > 3)
+ {
+   error ("invalid third argument");
+   return const0_rtx;
+ }
+
if (TARGET_64BIT && TARGET_PREFETCHI
&& local_func_symbolic_operand (op0, GET_MODE (op0)))
  emit_insn (gen_prefetchi (op0, op2));
@@ -13195,6 +13201,12 @@ ix86_expand_builtin (tree exp, rtx target, rtx 
subtarget,
op0 = copy_addr_to_reg (op0);
  }
 
+   if (INTVAL (op2) < 0 || INTVAL (op2) > 3)
+ {
+   warning (0, "invalid third argument to 
%<__builtin_ia32_prefetch%>; using zero");
+   op2 = const0_rtx;
+ }
+
if (TARGET_3DNOW || TARGET_PREFETCH_SSE
|| TARGET_PRFCHW || TARGET_PREFETCHWT1)
  emit_insn (gen_prefetch (op0, op1, op2));
diff --git a/gcc/testsuite/gcc.target/i386/pr108436.c 
b/gcc/testsuite/gcc.target/i386/pr108436.c
new file mode 100644
index 000..d51f25863a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108436.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-mprefetchi" } */
+
+int
+foo (int a)
+{
+  return a + 1;
+}
+
+void
+bad (int *p)
+{
+  __builtin_ia32_prefetch (p, 0, 4, 0);   /* { dg-warning "invalid third 
argument to '__builtin_ia32_prefetch'; using zero" } */
+  __builtin_ia32_prefetch (foo, 0, 4, 1);   /* { dg-error "invalid third 
argument" } */
+}
-- 
2.39.0



[PATCH] x86: Disable -mforce-indirect-call for PIC in 32-bit mode

2023-01-16 Thread H.J. Lu via Gcc-patches
-mforce-indirect-call generates an invalid instruction in the 32-bit MI
thunk since there are no available scratch registers in 32-bit PIC mode.
Disable -mforce-indirect-call for PIC in 32-bit mode when generating
the MI thunk.

gcc/

PR target/105980
* config/i386/i386.cc (x86_output_mi_thunk): Disable
-mforce-indirect-call for PIC in 32-bit mode.

gcc/testsuite/

PR target/105980
* g++.target/i386/pr105980.C: New test.
---
 gcc/config/i386/i386.cc  | 6 ++
 gcc/testsuite/g++.target/i386/pr105980.C | 8 
 2 files changed, 14 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/i386/pr105980.C

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 19fb03cfd44..3cacf738c4a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21480,6 +21480,7 @@ x86_output_mi_thunk (FILE *file, tree thunk_fndecl, 
HOST_WIDE_INT delta,
   rtx this_reg, tmp, fnaddr;
   unsigned int tmp_regno;
   rtx_insn *insn;
+  int saved_flag_force_indirect_call = flag_force_indirect_call;
 
   if (TARGET_64BIT)
 tmp_regno = R10_REG;
@@ -21492,6 +21493,9 @@ x86_output_mi_thunk (FILE *file, tree thunk_fndecl, 
HOST_WIDE_INT delta,
tmp_regno = DX_REG;
   else
tmp_regno = CX_REG;
+
+  if (flag_pic)
+  flag_force_indirect_call = 0;
 }
 
   emit_note (NOTE_INSN_PROLOGUE_END);
@@ -21659,6 +21663,8 @@ x86_output_mi_thunk (FILE *file, tree thunk_fndecl, 
HOST_WIDE_INT delta,
   final (insn, file, 1);
   final_end_function ();
   assemble_end_function (thunk_fndecl, fnname);
+
+  flag_force_indirect_call = saved_flag_force_indirect_call;
 }
 
 static void
diff --git a/gcc/testsuite/g++.target/i386/pr105980.C 
b/gcc/testsuite/g++.target/i386/pr105980.C
new file mode 100644
index 000..d8dbc332ea2
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr105980.C
@@ -0,0 +1,8 @@
+// { dg-do assemble { target { fpic } } }
+// { dg-options "-O0 -fpic -mforce-indirect-call" }
+
+struct A {
+  virtual ~A();
+};
+struct B : virtual A {};
+void bar() { B(); }
-- 
2.39.0



Re: [PATCH] libgfortran: Replace mutex with rwlock

2022-12-27 Thread H.J. Lu via Gcc-patches
On Sun, Dec 25, 2022 at 4:58 PM Steve Kargl via Gcc-patches
 wrote:
>
> On Wed, Dec 21, 2022 at 07:27:11PM -0500, Lipeng Zhu via Fortran wrote:
> > This patch try to introduce the rwlock and split the read/write to
> > unit_root tree and unit_cache with rwlock instead of the mutex to
> > increase CPU efficiency. In the get_gfc_unit function, the percentage
> > to step into the insert_unit function is around 30%, in most instances,
> > we can get the unit in the phase of reading the unit_cache or unit_root
> > tree. So split the read/write phase by rwlock would be an approach to
> > make it more parallel.
> >
> > BTW, the IPC metrics can increase from 0.25 to 2.2 in the Intel
> > SRP server with 220 cores. The benchmark we used is
> > https://github.com/rwesson/NEAT
> >
>
> The patch fails bootstrap on x86_64-*-freebsd.
>
> gmake[6]: Entering directory 
> '/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/src/c++17'
> /bin/sh ../../libtool --tag CXX --tag disable-shared   --mode=compile 
> /home/kargl/gcc/obj/./gcc/xgcc -shared-libgcc -B/home/kargl/gcc/obj/./gcc 
> -nostdinc++ -L/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/src 
> -L/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/src/.libs 
> -L/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/libsupc++/.libs 
> -B/home/kargl/work/x86_64-unknown-freebsd14.0/bin/ 
> -B/home/kargl/work/x86_64-unknown-freebsd14.0/lib/ -isystem 
> /home/kargl/work/x86_64-unknown-freebsd14.0/include -isystem 
> /home/kargl/work/x86_64-unknown-freebsd14.0/sys-include   -fno-checking 
> -I/home/kargl/gcc/gcc/libstdc++-v3/../libgcc 
> -I/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include/x86_64-unknown-freebsd14.0
>  -I/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include 
> -I/home/kargl/gcc/gcc/libstdc++-v3/libsupc++   -std=gnu++17 -nostdinc++ 
> -prefer-pic -D_GLIBCXX_SHARED -fno-implicit-templates  -Wall -Wextra 
> -Wwrite-strings -Wcast-qual -Wabi=2  -fdiagnostics-show-location=once   
> -ffunction-sections -fdata-sections  -frandom-seed=floating_from_chars.lo  
> -fimplicit-templates -g -O2  -c -o floating_from_chars.lo 
> ../../../../../gcc/libstdc++-v3/src/c++17/floating_from_chars.cc
> libtool: compile:  /home/kargl/gcc/obj/./gcc/xgcc -shared-libgcc 
> -B/home/kargl/gcc/obj/./gcc -nostdinc++ 
> -L/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/src 
> -L/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/src/.libs 
> -L/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/libsupc++/.libs 
> -B/home/kargl/work/x86_64-unknown-freebsd14.0/bin/ 
> -B/home/kargl/work/x86_64-unknown-freebsd14.0/lib/ -isystem 
> /home/kargl/work/x86_64-unknown-freebsd14.0/include -isystem 
> /home/kargl/work/x86_64-unknown-freebsd14.0/sys-include -fno-checking 
> -I/home/kargl/gcc/gcc/libstdc++-v3/../libgcc 
> -I/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include/x86_64-unknown-freebsd14.0
>  -I/home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include 
> -I/home/kargl/gcc/gcc/libstdc++-v3/libsupc++ -std=gnu++17 -nostdinc++ 
> -D_GLIBCXX_SHARED -fno-implicit-templates -Wall -Wextra -Wwrite-strings 
> -Wcast-qual -Wabi=2 -fdiagnostics-show-location=once -ffunction-sections 
> -fdata-sections -frandom-seed=floating_from_chars.lo -fimplicit-templates -g 
> -O2 -c ../../../../../gcc/libstdc++-v3/src/c++17/floating_from_chars.cc  
> -fPIC -DPIC -D_GLIBCXX_SHARED -o floating_from_chars.o
> In file included from 
> /home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include/memory_resource:40,
>  from 
> ../../../../../gcc/libstdc++-v3/src/c++17/floating_from_chars.cc:37:
> /home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include/shared_mutex:
>  In function 'int std::__glibcxx_rwlock_rdlock(pthread_rwlock**)':
> /home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include/shared_mutex:80:3:
>  error: call of overloaded '__gthrw_pthread_rwlock_rdlock(pthread_rwlock**&)' 
> is ambiguous
>80 |   _GLIBCXX_GTHRW(rwlock_rdlock)
>   |   ^
> In file included from 
> /home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include/x86_64-unknown-freebsd14.0/bits/gthr.h:148,
>  from 
> /home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include/bits/std_mutex.h:41,
>  from 
> /home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include/shared_mutex:41:
> /home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include/shared_mutex:80:3:
>  note: candidate: 'int std::__gthrw_pthread_rwlock_rdlock(pthread_rwlock**)'
>80 |   _GLIBCXX_GTHRW(rwlock_rdlock)
>   |   ^~
> /home/kargl/gcc/obj/x86_64-unknown-freebsd14.0/libstdc++-v3/include/x86_64-unknown-freebsd14.0/bits/gthr-default.h:140:1:
>  note: candidate: 'int __gthrw_pthread_rwlock_rdlock(pthread_rwlock**)'
>   140 | __gthrw(pthread_rwlock_rdlock)
>   | ^~~
> 

Re: [PATCH V2 2/2] [x86] x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR.

2022-12-21 Thread H.J. Lu via Gcc-patches
On Wed, Dec 21, 2022 at 2:35 PM Jakub Jelinek  wrote:
>
> On Wed, Dec 21, 2022 at 12:20:23PM -0800, H.J. Lu wrote:
> > On Mon, Dec 19, 2022 at 8:52 PM Hongtao Liu  wrote:
> > >
> > > On Thu, Dec 15, 2022 at 3:45 PM Hongtao Liu  wrote:
> > > >
> > > > On Thu, Dec 15, 2022 at 3:39 PM Jakub Jelinek  wrote:
> > > > >
> > > > > On Thu, Dec 15, 2022 at 02:21:37PM +0800, liuhongt via Gcc-patches 
> > > > > wrote:
> > > > > > --- a/gcc/config/i386/i386.opt
> > > > > > +++ b/gcc/config/i386/i386.opt
> > > > > > @@ -420,6 +420,10 @@ mpc80
> > > > > >  Target RejectNegative
> > > > > >  Set 80387 floating-point precision to 80-bit.
> > > > > >
> > > > > > +mdaz-ftz
> > > > > > +Target
> > > > >
> > > > > s/Target/Driver/
> > > > Change to Driver and Got error like:cc1: error: command-line option
> > > > ‘-mdaz-ftz’ is valid for the driver but not for C.
> > > Hi Jakub:
> > >   I didn't find a good solution to handle this error after changing
> > > *Target* to *Driver*, Could you give some hints how to solve this
> > > problem?
> > > Or is it ok for you to mark this as *Target*(there won't be any save
> > > and restore in cfun since there's no variable defined here.)
> >
> > Since all -m* options are passed to cc1, -mdaz-ftz can't be marked
> > as Driver.  We need to give it a different name to mark it as Driver.
>
> It is ok like that.
>
> Jakub
>

The GCC driver handles -mno-XXX automatically for -mXXX.  Using
a different name means the negation has to be handled explicitly.  Or we can
do something like this to check for CL_DRIVER before passing it to cc1.
-- 
H.J.
diff --git a/gcc/gcc.cc b/gcc/gcc.cc
index 2568d541196..87cbea11ae1 100644
--- a/gcc/gcc.cc
+++ b/gcc/gcc.cc
@@ -3851,7 +3851,7 @@ alloc_switch (void)
 
 static void
 save_switch (const char *opt, size_t n_args, const char *const *args,
-	 bool validated, bool known)
+	 bool validated, bool known, bool driver = false)
 {
   alloc_switch ();
   switches[n_switches].part1 = opt + 1;
@@ -3868,6 +3868,7 @@ save_switch (const char *opt, size_t n_args, const char *const *args,
   switches[n_switches].validated = validated;
   switches[n_switches].known = known;
   switches[n_switches].ordering = 0;
+  switches[n_switches].driver = driver;
   n_switches++;
 }
 
@@ -4575,7 +4576,8 @@ driver_handle_option (struct gcc_options *opts,
   if (do_save)
 save_switch (decoded->canonical_option[0],
 		 decoded->canonical_option_num_elements - 1,
-		 &decoded->canonical_option[1], validated, true);
+		 &decoded->canonical_option[1], validated, true,
+		 cl_options[opt_index].flags == CL_DRIVER);
   return true;
 }
 
@@ -7465,7 +7467,8 @@ check_live_switch (int switchnum, int prefix_length)
 static void
 give_switch (int switchnum, int omit_first_word)
 {
-  if ((switches[switchnum].live_cond & SWITCH_IGNORE) != 0)
+  if ((switches[switchnum].live_cond & SWITCH_IGNORE) != 0
+  || switches[switchnum].driver)
 return;
 
   if (!omit_first_word)
diff --git a/gcc/opts.h b/gcc/opts.h
index ce4fd5c39b9..2900f0d9168 100644
--- a/gcc/opts.h
+++ b/gcc/opts.h
@@ -561,6 +561,7 @@ struct switchstr
   bool known;
   bool validated;
   bool ordering;
+  bool driver;
 };
 
 #endif


Re: [PATCH V2 2/2] [x86] x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR.

2022-12-21 Thread H.J. Lu via Gcc-patches
On Mon, Dec 19, 2022 at 8:52 PM Hongtao Liu  wrote:
>
> On Thu, Dec 15, 2022 at 3:45 PM Hongtao Liu  wrote:
> >
> > On Thu, Dec 15, 2022 at 3:39 PM Jakub Jelinek  wrote:
> > >
> > > On Thu, Dec 15, 2022 at 02:21:37PM +0800, liuhongt via Gcc-patches wrote:
> > > > --- a/gcc/config/i386/i386.opt
> > > > +++ b/gcc/config/i386/i386.opt
> > > > @@ -420,6 +420,10 @@ mpc80
> > > >  Target RejectNegative
> > > >  Set 80387 floating-point precision to 80-bit.
> > > >
> > > > +mdaz-ftz
> > > > +Target
> > >
> > > s/Target/Driver/
> > Change to Driver and Got error like:cc1: error: command-line option
> > ‘-mdaz-ftz’ is valid for the driver but not for C.
> Hi Jakub:
>   I didn't find a good solution to handle this error after changing
> *Target* to *Driver*, Could you give some hints how to solve this
> problem?
> Or is it ok for you to mark this as *Target*(there won't be any save
> and restore in cfun since there's no variable defined here.)

Since all -m* options are passed to cc1, -mdaz-ftz can't be marked
as Driver.  We need to give it a different name to mark it as Driver.


-- 
H.J.


[PATCH] libsanitizer: Add __interceptor_sigsetjmp_internal

2022-12-16 Thread H.J. Lu via Gcc-patches
Add an internal alias to __interceptor_sigsetjmp to avoid R_X86_64_PC32
relocation for "jmp __interceptor_sigsetjmp" with old assemblers.

PR sanitizer/108106
* hwasan/hwasan_setjmp_x86_64.S (__interceptor_sigsetjmp): Add
an internal alias, __interceptor_sigsetjmp_internal.
---
 libsanitizer/hwasan/hwasan_setjmp_x86_64.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libsanitizer/hwasan/hwasan_setjmp_x86_64.S 
b/libsanitizer/hwasan/hwasan_setjmp_x86_64.S
index 7566c1ea0a5..071dcdcf613 100644
--- a/libsanitizer/hwasan/hwasan_setjmp_x86_64.S
+++ b/libsanitizer/hwasan/hwasan_setjmp_x86_64.S
@@ -37,13 +37,14 @@ __interceptor_setjmp:
   CFI_STARTPROC
   _CET_ENDBR
   xorl %esi, %esi
-  jmp  __interceptor_sigsetjmp
+  jmp  __interceptor_sigsetjmp_internal
   CFI_ENDPROC
 ASM_SIZE(__interceptor_setjmp)
 
 .global __interceptor_sigsetjmp
 ASM_TYPE_FUNCTION(__interceptor_sigsetjmp)
 __interceptor_sigsetjmp:
+__interceptor_sigsetjmp_internal:
   CFI_STARTPROC
   _CET_ENDBR
 
-- 
2.38.1



Re: [PATCH] [x86] Fix unrecognizable insn due to illegal immediate_operand (const_int 255) of QImode.

2022-11-29 Thread H.J. Lu via Gcc-patches
On Mon, Nov 28, 2022 at 11:04 PM Hongtao Liu  wrote:
>
> On Mon, Nov 28, 2022 at 9:06 PM liuhongt  wrote:
> >
> > For __builtin_ia32_vec_set_v16qi (a, -1, 2) with
> > !flag_signed_char. it's transformed to
> > __builtin_ia32_vec_set_v16qi (_4, 255, 2) in the gimple,
> > and expanded to (const_int 255) in the rtl. But for immediate_operand,
> > it expects (const_int 255) to be signed extended to
> > (const_int -1). The mismatch caused an unrecognizable insn error.
> >
> > expand_expr_real_1 generates (const_int 255) without considering the target 
> > mode.
> > I guess it's on purpose, so I'll leave that alone and only change the 
> > expander
> > in the backend. After applying convert_modes to (const_int 255),
> > it's transformed to (const_int -1) which fix the issue.
> >
> > Bootstrapped and regtested x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk(and backport to GCC-10/11/12 release branches)?
> Drop this patch since it's not a complete solution, there're also
> other QI builtins which is not handled.

I checked the x86 backend.  __builtin_ia32_vec_set_v16qi is the
only intrinsic with this issue.

> >
> > gcc/ChangeLog:
> >
> > PR target/107863
> > * config/i386/i386-expand.cc (ix86_expand_vec_set_builtin):
> > Convert op1 to target mode whenever mode mismatch.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr107863.c: New test.
> > ---
> >  gcc/config/i386/i386-expand.cc   | 2 +-
> >  gcc/testsuite/gcc.target/i386/pr107863.c | 8 
> >  2 files changed, 9 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107863.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index 0373c3614a4..c639ee3a9f7 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -12475,7 +12475,7 @@ ix86_expand_vec_set_builtin (tree exp)
> >op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
> >elt = get_element_number (TREE_TYPE (arg0), arg2);
> >
> > -  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
> > +  if (GET_MODE (op1) != mode1)
> >  op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
> >
> >op0 = force_reg (tmode, op0);
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107863.c 
> > b/gcc/testsuite/gcc.target/i386/pr107863.c
> > new file mode 100644
> > index 000..99fd85d9765
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107863.c
> > @@ -0,0 +1,8 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx2 -O" } */
> > +
> > +typedef char v16qi __attribute__((vector_size(16)));
> > +
> > +v16qi foo(v16qi a){
> > +  return __builtin_ia32_vec_set_v16qi (a, -1, 2);
> > +}
> > --
> > 2.27.0
> >
>
>
> --
> BR,
> Hongtao



-- 
H.J.


Re: [PATCH 0/2] Support HWASAN with Intel LAM

2022-11-28 Thread H.J. Lu via Gcc-patches
On Mon, Nov 28, 2022 at 6:40 AM Martin Liška  wrote:
>
> On 11/11/22 02:26, liuhongt via Gcc-patches wrote:
> >2 years ago, ARM folks support HWASAN[1] in GCC[2], and introduced 
> > several
> > target hooks(Many thanks to their work) so other backends can do similar
> > things if they have similar feature.
> >Intel LAM(linear Address Masking)[3 Charpter 14] supports similar 
> > feature with
> > the upper bits of pointers can be used as metadata, LAM support two modes:
> >LAM_U48:bits 48-62 can be used as metadata
> >LAM_U57:bits 57-62 can be used as metadata.
> >
> > These 2 patches mainly support those target hooks, but HWASAN is not really
> > enabled until the final decision for the LAM kernel interface which may take
> > quite a long time. We have verified our patches with a "fake" interface 
> > locally[4], and
> > decided to push the backend patches to the GCC13 to make other HWASAN 
> > developper's work
> > easy.
>
> Hello.
>
> A few random comments I noticed:
>
> 1) please document the new target -mlam in extend.texi
> 2) the description speaks about bits [48-62] or [57-62], can explain why the 
> patch contains:
>
> +  /* Mask off bit63 when LAM_U57.  */
> +  if (ix86_lam_type == lam_u57)
> ?
>
> 3) Shouldn't the -mlam option emit GNU_PROPERTY_X86_FEATURE_1_LAM_U57 or 
> GNU_PROPERTY_X86_FEATURE_1_LAM_U48
> .gnu.property note?

Since there are no clear usages for these LAM bits, we can
leave them out for now.

> 4) Can you please explain Florian's comment here:
> https://gitlab.com/x86-psABIs/x86-64-ABI/-/merge_requests/13#note_1181396487
>
> Thanks,
> Martin
>
> >
> > [1] https://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html
> > [2] https://gcc.gnu.org/pipermail/gcc-patches/2020-November/557857.html
> > [3] 
> > https://www.intel.com/content/dam/develop/external/us/en/documents/architecture-instruction-set-extensions-programming-reference.pdf
> > [4] https://gitlab.com/x86-gcc/gcc/-/tree/users/intel/lam/master
> >
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > liuhongt (2):
> >Implement hwasan target_hook.
> >Enable hwasan for x86-64.
> >
> >   gcc/config/i386/i386-expand.cc  |  12 
> >   gcc/config/i386/i386-options.cc |   3 +
> >   gcc/config/i386/i386-opts.h |   6 ++
> >   gcc/config/i386/i386-protos.h   |   2 +
> >   gcc/config/i386/i386.cc | 123 
> >   gcc/config/i386/i386.opt|  16 +
> >   libsanitizer/configure.tgt  |   1 +
> >   7 files changed, 163 insertions(+)
> >
>


-- 
H.J.


Re: [PATCH v4] eliminate mutex in fast path of __register_frame

2022-11-21 Thread H.J. Lu via Gcc-patches
On Mon, Nov 21, 2022 at 3:49 AM Jakub Jelinek via Gcc-patches
 wrote:
>
> On Mon, Nov 21, 2022 at 12:22:32PM +0100, Thomas Neumann via Gcc-patches 
> wrote:
> > > When dynamically linking a fast enough machine hides the latency, but when
> > > Statically linking or on slower devices this change caused a 5x increase 
> > > in
> > > Instruction count and 2x increase in cycle count before getting to main.
> > >
> > > This has been quite noticeable on smaller devices.  Is there a reason the 
> > > btree
> > > can't be initialized lazily? It seems a bit harsh to pay the cost of 
> > > unwinding at
> > > startup even when you don't throw exceptions..
> >
> > we cannot easily do that lazily because otherwise we need a mutex for lazy
> > initialization, which is exactly what we wanted to get rid of.
> >
> > Having said that, I am surprised that you saw a noticeable difference. On
> > most platforms there should not be dynamic frame registration at all, as the
> > regular frames are directly read from the ELF data.
> >
> > Can you please send me an precise description on how to reproduce the issue?
> > (Platform, tools, a VM if you have one would be great). I will then debug
> > this to improve the startup time.
>
> I can see it being called as well for -static linked binaries.
> -static links in crtbeginT.o which is libgcc/crtstuff.c built with
> CRTSTUFFT_O macro being defined among other things, and that disables
> USE_PT_GNU_EH_FRAME:
> #if defined(OBJECT_FORMAT_ELF) \
> && !defined(OBJECT_FORMAT_FLAT) \
> && defined(HAVE_LD_EH_FRAME_HDR) \
> && !defined(inhibit_libc) && !defined(CRTSTUFFT_O) \
> && defined(__GLIBC__) && __GLIBC__ >= 2
> #include <link.h>
> /* uClibc pretends to be glibc 2.2 and DT_CONFIG is defined in its link.h.
>But it doesn't use PT_GNU_EH_FRAME ELF segment currently.  */
> # if !defined(__UCLIBC__) \
>  && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 2) \
>  || (__GLIBC__ == 2 && __GLIBC_MINOR__ == 2 && defined(DT_CONFIG)))
> #  define USE_PT_GNU_EH_FRAME
> # endif
> #endif
>
> I think .eh_frame_hdr was never used for statically linked programs,
> see already https://gcc.gnu.org/legacy-ml/gcc-patches/2001-12/msg01383.html
> We don't pass --eh-frame-hdr when linking statically and dl_iterate_phdr
> doesn't handle those.
> Now, if -static -Wl,--eh-frame-hdr is passed when linking to the driver,
> .eh_frame_hdr section is created and __GNU_EH_FRAME_HDR symbol points to
> the start of that section, so at least that section could be found
> if something in the crt files and libgcc is adjusted.  But e.g.
> i?86, nios2, frv and bfin we also need to find the got.  Also, would it
> work even for static PIEs?
>
> Jakub
>

There is

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54568

-- 
H.J.


Re: [PATCH v2] Always use TYPE_MODE instead of DECL_MODE for vector field

2022-11-07 Thread H.J. Lu via Gcc-patches
On Mon, Oct 24, 2022 at 11:28 PM Richard Biener
 wrote:
>
> On Mon, Oct 24, 2022 at 10:02 PM H.J. Lu  wrote:
> >
> > On Mon, Oct 24, 2022 at 12:12 AM Richard Biener
> >  wrote:
> > >
> > > On Fri, Oct 21, 2022 at 6:18 PM H.J. Lu  wrote:
> > > >
> > > > On Fri, Oct 21, 2022 at 2:33 AM Richard Biener
> > > >  wrote:
> > > > >
> > > > > On Thu, Oct 20, 2022 at 6:58 PM H.J. Lu via Gcc-patches
> > > > >  wrote:
> > > > > >
> > > > > > commit e034c5c895722e0092d2239cd8c2991db77d6d39
> > > > > > Author: Jakub Jelinek 
> > > > > > Date:   Sat Dec 2 08:54:47 2017 +0100
> > > > > >
> > > > > > PR target/78643
> > > > > > PR target/80583
> > > > > > * expr.c (get_inner_reference): If DECL_MODE of a 
> > > > > > non-bitfield
> > > > > > is BLKmode for vector field with vector raw mode, use 
> > > > > > TYPE_MODE
> > > > > > instead of DECL_MODE.
> > > > > >
> > > > > > fixed the case where DECL_MODE of a vector field is BLKmode and its
> > > > > > TYPE_MODE is a vector mode because of target attribute.  Remove the
> > > > > > BLKmode check for the case where DECL_MODE of a vector field is a 
> > > > > > vector
> > > > > > mode and its TYPE_MODE is BLKmode because of target attribute.
> > > > > >
> > > > > > gcc/
> > > > > >
> > > > > > PR target/107304
> > > > > > * expr.c (get_inner_reference): Always use TYPE_MODE for 
> > > > > > vector
> > > > > > field with vector raw mode.
> > > > > >
> > > > > > gcc/testsuite/
> > > > > >
> > > > > > PR target/107304
> > > > > > * gcc.target/i386/pr107304.c: New test.
> > > > > > ---
> > > > > >  gcc/expr.cc  |  3 +-
> > > > > >  gcc/testsuite/gcc.target/i386/pr107304.c | 39 
> > > > > > 
> > > > > >  2 files changed, 40 insertions(+), 2 deletions(-)
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107304.c
> > > > > >
> > > > > > diff --git a/gcc/expr.cc b/gcc/expr.cc
> > > > > > index efe387e6173..9145193c2c1 100644
> > > > > > --- a/gcc/expr.cc
> > > > > > +++ b/gcc/expr.cc
> > > > > > @@ -7905,8 +7905,7 @@ get_inner_reference (tree exp, poly_int64_pod 
> > > > > > *pbitsize,
> > > > > >   /* For vector fields re-check the target flags, as 
> > > > > > DECL_MODE
> > > > > >  could have been set with different target flags than
> > > > > >  the current function has.  */
> > > > > > - if (mode == BLKmode
> > > > > > - && VECTOR_TYPE_P (TREE_TYPE (field))
> > > > > > + if (VECTOR_TYPE_P (TREE_TYPE (field))
> > > > > >   && VECTOR_MODE_P (TYPE_MODE_RAW (TREE_TYPE (field
> > > > >
> > > > > Isn't the check on TYPE_MODE_RAW also wrong then?  Btw, the mode could
> > > >
> > > > TYPE_MODE_RAW is always set to a vector mode for a vector type:
> > > >
> > > >/* Find an appropriate mode for the vector type.  */
> > > > if (TYPE_MODE (type) == VOIDmode)
> > > >   SET_TYPE_MODE (type,
> > > >  mode_for_vector (SCALAR_TYPE_MODE (innertype),
> > > >   nunits).else_blk ());
> > >
> > > But mode_for_vector can return a MODE_INT!
> >
> > You are right.
> >
> > >   /* For integers, try mapping it to a same-sized scalar mode.  */
> > >   if (GET_MODE_CLASS (innermode) == MODE_INT)
> > > {
> > >   poly_uint64 nbits = nunits * GET_MODE_BITSIZE (innermode);
> > >   if (int_mode_for_size (nbits, 0).exists ()
> > >   && have_regs_of_mode[mode])
> > > return mode;
> > >
> > > > But TYPE_MODE returns BLKmode if the vector mode is unsupported.
> > > >
> > > > > also be an integer mode.
> > > >
> > > > For a vector field, mode is either BLK mode or the vector mode.  Jakub,
> > > > can you comment on it?
> > >
> > > I think that for
> > >
> > > typedef int v2si __attribute__((vector_size(8)));
> > >
> > > struct X { int i; v2si j; };
> > >
> > > v2si should get DImode with -mno-sse?
> > >
> >
> > Currently GCC generates
> >
> > (insn 31 32 33 (set (subreg:DI (reg:V2SI 105) 0)
> > (reg:DI 84 [ _3 ])) "y2.c":12:11 -1
> >  (nil))
> >
> > With my patch, v2si gets DImode directly without SUBREG.
> >
> > Here is the v2 patch with the update commit message:
> >
> > Remove the BLKmode check for the case where DECL_MODE
> > of a vector field is a vector mode and its TYPE_MODE isn't a
> > vector mode because of target attribute.
> >
> > OK for master?
>
> OK.

OK to backport to release branches?

Thanks.

-- 
H.J.


[PATCH] Extend optimization for integer bit test on __atomic_fetch_[or|and]_*

2022-11-02 Thread H.J. Lu via Gcc-patches
Extend optimization for

_1 = __atomic_fetch_or_4 (ptr_6, 0x8000, _3);
_5 = (signed int) _1;
_4 = _5 >= 0;

to

_1 = __atomic_fetch_or_4 (ptr_6, 0x8000, _3);
_5 = (signed int) _1;
if (_5 >= 0)

gcc/

PR middle-end/102566
* tree-ssa-ccp.cc (optimize_atomic_bit_test_and): Also handle
if (_5 < 0) and if (_5 >= 0).

gcc/testsuite/

PR middle-end/102566
* g++.target/i386/pr102566-7.C: New test.
---
 gcc/testsuite/g++.target/i386/pr102566-7.C | 22 ++
 gcc/tree-ssa-ccp.cc| 84 ++
 2 files changed, 91 insertions(+), 15 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-7.C

diff --git a/gcc/testsuite/g++.target/i386/pr102566-7.C 
b/gcc/testsuite/g++.target/i386/pr102566-7.C
new file mode 100644
index 000..ce90214f33d
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-7.C
@@ -0,0 +1,22 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+template
+void lock_bts(std::atomic ) { while (!(a.fetch_or(b) & b)); }
+template
+void lock_btr(std::atomic ) { while (a.fetch_and(~b) & b); }
+template
+void lock_btc(std::atomic ) { while (a.fetch_xor(b) & b); }
+template void lock_bts<1U<<30>(std::atomic );
+template void lock_btr<1U<<30>(std::atomic );
+template void lock_btc<1U<<30>(std::atomic );
+template void lock_bts<1U<<31>(std::atomic );
+template void lock_btr<1U<<31>(std::atomic );
+template void lock_btc<1U<<31>(std::atomic );
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 2 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrl" 2 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btcl" 2 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 9778e776cf2..3a4b6bc1118 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -3471,17 +3471,35 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator 
*gsip,
{
  gimple *use_nop_stmt;
  if (!single_imm_use (use_lhs, &use_p, &use_nop_stmt)
- || !is_gimple_assign (use_nop_stmt))
+ || (!is_gimple_assign (use_nop_stmt)
+ && gimple_code (use_nop_stmt) != GIMPLE_COND))
return false;
- tree use_nop_lhs = gimple_assign_lhs (use_nop_stmt);
- rhs_code = gimple_assign_rhs_code (use_nop_stmt);
- if (rhs_code != BIT_AND_EXPR)
+ /* Handle both
+_4 = _5 < 0;
+and
+if (_5 < 0)
+  */
+ tree use_nop_lhs = nullptr;
+ rhs_code = ERROR_MARK;
+ if (is_gimple_assign (use_nop_stmt))
{
- if (TREE_CODE (use_nop_lhs) == SSA_NAME
+ use_nop_lhs = gimple_assign_lhs (use_nop_stmt);
+ rhs_code = gimple_assign_rhs_code (use_nop_stmt);
+   }
+ if (!use_nop_lhs || rhs_code != BIT_AND_EXPR)
+   {
+ /* Also handle
+if (_5 < 0)
+  */
+ if (use_nop_lhs
+ && TREE_CODE (use_nop_lhs) == SSA_NAME
  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use_nop_lhs))
return false;
- if (rhs_code == BIT_NOT_EXPR)
+ if (use_nop_lhs && rhs_code == BIT_NOT_EXPR)
{
+ /* Handle
+_7 = ~_2;
+  */
  g = convert_atomic_bit_not (fn, use_nop_stmt, lhs,
  mask);
  if (!g)
@@ -3512,14 +3530,31 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator 
*gsip,
}
  else
{
- if (TREE_CODE (TREE_TYPE (use_nop_lhs)) != BOOLEAN_TYPE)
-   return false;
+ tree cmp_rhs1, cmp_rhs2;
+ if (use_nop_lhs)
+   {
+ /* Handle
+_4 = _5 < 0;
+  */
+ if (TREE_CODE (TREE_TYPE (use_nop_lhs))
+ != BOOLEAN_TYPE)
+   return false;
+ cmp_rhs1 = gimple_assign_rhs1 (use_nop_stmt);
+ cmp_rhs2 = gimple_assign_rhs2 (use_nop_stmt);
+   }
+ else
+   {
+ /* Handle
+if (_5 < 0)
+  */
+ rhs_code = gimple_cond_code (use_nop_stmt);
+ cmp_rhs1 = gimple_cond_lhs (use_nop_stmt);
+ cmp_rhs2 = gimple_cond_rhs (use_nop_stmt);
+   }
  if (rhs_code != GE_EXPR && rhs_code != LT_EXPR)
return false;
- tree cmp_rhs1 = gimple_assign_rhs1 (use_nop_stmt);
  if (use_lhs != cmp_rhs1)
return false;
- tree cmp_rhs2 = gimple_assign_rhs2 (use_nop_stmt);
  if 

[PATCH] x86: Track converted/skipped registers in STV

2022-10-31 Thread H.J. Lu via Gcc-patches
When converting integer computations into vector ones, we build a chain
from an integer definition instruction together with all dependent use
instructions.  The integer computations on the chain are converted to
vector ones if the total vector costs are lower than the integer ones.
Since the same register may appear in multiple chains, if it has been
converted or skipped in one chain, its instances in the other chains
must also be converted or skipped, regardless if the total vector costs
are lower than integer ones.  Otherwise, we will get the unexpected
vector mode in integer instruction patterns.

To track skipped registers, we add a bitmap, skipped_regs, when converting
integer computations into vector ones.  When computing gain for vector
computations, we convert or skip a chain if any register on the chain has
been converted or skipped already.

Note: If 2 integer registers on a chain, one has been converted and the
other has been skipped already, it will lead to a compiler error since
we can't undo the conversion.

gcc/

PR target/106933
PR target/106959
* config/i386/i386-features.cc (scalar_chain::skipped_regs): New.
(scalar_chain::update_skipped_regs): Likewise.
(scalar_chain::check_convert_gain): Likewise.
(general_scalar_chain::compute_convert_gain ): Return gain if
check_convert_gain returns non-zero.
(general_scalar_chain::compute_convert_gain): Call
update_skipped_regs if a chain won't be converted.
(timode_scalar_chain::compute_convert_gain): Likewise.
(convert_scalars_to_vector): Initialize and release
scalar_chain::skipped_regs before and after its use.
* config/i386/i386-features.h (scalar_chain): Add
skipped_regs, check_convert_gain and update_skipped_regs.

gcc/testsuite/

* gcc.target/i386/pr106933.c: New test.
* gcc.target/i386/pr106959.c: Likewise.
---
 gcc/config/i386/i386-features.cc | 104 ++-
 gcc/config/i386/i386-features.h  |   5 ++
 gcc/testsuite/gcc.target/i386/pr106933.c |  17 
 gcc/testsuite/gcc.target/i386/pr106959.c |  13 +++
 4 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106933.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106959.c

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index fd212262f50..d9d63cf8d22 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -273,6 +273,8 @@ xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
 
 unsigned scalar_chain::max_id = 0;
 
+bitmap_head scalar_chain::skipped_regs;
+
 namespace {
 
 /* Initialize new chain.  */
@@ -477,6 +479,72 @@ scalar_chain::build (bitmap candidates, unsigned insn_uid)
   BITMAP_FREE (queue);
 }
 
+/* Add all scalar mode registers, which are set by INSN and not used in
+   both vector and scalar modes, to skipped register map. */
+
+void
+scalar_chain::update_skipped_regs (rtx_insn *insn)
+{
+  for (df_ref def = DF_INSN_DEFS (insn);
+   def;
+   def = DF_REF_NEXT_LOC (def))
+{
+  rtx reg = DF_REF_REG (def);
+  if (GET_MODE (reg) == smode
+ && !bitmap_bit_p (defs_conv, REGNO (reg)))
+   bitmap_set_bit (&skipped_regs, REGNO (reg));
+}
+}
+
+/* Check convert gain for INSN.  Return 1 if any registers, which are
+   set or used by INSN, have been converted to vector mode.  Return -1
+   if any registers set by INSN are skipped in other chains.  Return 0
+   otherwise.  */
+
+int
+scalar_chain::check_convert_gain (rtx_insn *insn)
+{
+  for (df_ref def = DF_INSN_DEFS (insn);
+   def;
+   def = DF_REF_NEXT_LOC (def))
+{
+  rtx reg = DF_REF_REG (def);
+  if (GET_MODE (reg) == vmode)
+   {
+ if (dump_file)
+   fprintf (dump_file,
+"  Gain 1 for converted register r%d\n",
+REGNO (reg));
+ return 1;
+   }
+  else if (bitmap_bit_p (&skipped_regs, REGNO (reg)))
+   {
+ if (dump_file)
+   fprintf (dump_file,
+"  Gain -1 for skipped register r%d\n",
+REGNO (reg));
+ return -1;
+   }
+}
+
+  for (df_ref ref = DF_INSN_USES (insn);
+   ref;
+   ref = DF_REF_NEXT_LOC (ref))
+{
+  rtx reg = DF_REF_REG (ref);
+  if (GET_MODE (reg) == vmode)
+   {
+ if (dump_file)
+   fprintf (dump_file,
+"  Gain 1 for converted register r%d\n",
+REGNO (reg));
+ return 1;
+   }
+}
+
+  return 0;
+}
+
 /* Return a cost of building a vector costant
instead of using a scalar one.  */
 
@@ -515,10 +583,15 @@ general_scalar_chain::compute_convert_gain ()
   EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
 {
   rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
+  /* If check_convert_gain returns non-zero on any INSN, the chain
+must be converted 

Re: [PATCH] x86: Replace ne:CCC/ne:CCO with UNSPEC_CC_NE in neg patterns

2022-10-28 Thread H.J. Lu via Gcc-patches
On Fri, Oct 28, 2022 at 2:34 PM Segher Boessenkool
 wrote:
>
> On Wed, Oct 26, 2022 at 11:58:57AM -0700, H.J. Lu via Gcc-patches wrote:
> > In i386.md, neg patterns which set MODE_CC register like
> >
> > (set (reg:CCC FLAGS_REG)
> >  (ne:CCC (match_operand:SWI48 1 "general_reg_operand") (const_int 0)))
> >
> > can lead to errors when operand 1 is a constant value.  If FLAGS_REG in
>
> But it cannot be.  general_reg_operand will not allow that:
> ===
> (define_predicate "general_reg_operand"
>   (and (match_code "reg")
>(match_test "GENERAL_REGNO_P (REGNO (op))")))
> ===
>
> > (set (reg:CCC FLAGS_REG)
> >  (ne:CCC (const_int 2) (const_int 0)))
> >
> > is set to 1, RTX simplifiers may simplify
>

Here is another example:

(define_insn "*neg<mode>_ccc_1"
  [(set (reg:CCC FLAGS_REG)
(ne:CCC
  (match_operand:SWI 1 "nonimmediate_operand" "0")
  (const_int 0)))
   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
(neg:SWI (match_dup 1)))]
  ""
  "neg{<imodesuffix>}\t%0"
  [(set_attr "type" "negnot")
   (set_attr "mode" "<MODE>")])

Operand 1 can be a known value.

H.J.


Re: [PATCH] x86: Replace ne:CCC/ne:CCO with UNSPEC_CC_NE in neg patterns

2022-10-28 Thread H.J. Lu via Gcc-patches
On Fri, Oct 28, 2022 at 1:35 AM Eric Botcazou  wrote:
>
> > (set (reg:SI 93)
> > >  (neg:SI (ltu:SI (reg:CCC 17 flags) (const_int 0 [0]))))
> >
> > as
> >
> > (set (reg:SI 93)
> > >  (neg:SI (ltu:SI (const_int 1) (const_int 0 [0]))))
> >
> > which leads to incorrect results since LTU on MODE_CC register isn't the
> > same as "unsigned less than" in x86 backend.
>
> That's not specific to the x86 back-end, i.e. it's a generic caveat.
>
> >   PR target/107172
> >   * config/i386/i386.md (UNSPEC_CC_NE): New.
> >   Replace ne:CCC/ne:CCO with UNSPEC_CC_NE in neg patterns.
>
> FWIW the SPARC back-end uses a COMPARE instead of an UNSPEC here.

COMPARE may also set CC register to a constant when both operands are
known constants.


-- 
H.J.


[PATCH] x86: Replace ne:CCC/ne:CCO with UNSPEC_CC_NE in neg patterns

2022-10-26 Thread H.J. Lu via Gcc-patches
In i386.md, neg patterns which set MODE_CC register like

(set (reg:CCC FLAGS_REG)
 (ne:CCC (match_operand:SWI48 1 "general_reg_operand") (const_int 0)))

can lead to errors when operand 1 is a constant value.  If FLAGS_REG in

(set (reg:CCC FLAGS_REG)
 (ne:CCC (const_int 2) (const_int 0)))

is set to 1, RTX simplifiers may simplify

(set (reg:SI 93)
 (neg:SI (ltu:SI (reg:CCC 17 flags) (const_int 0 [0]))))

as

(set (reg:SI 93)
 (neg:SI (ltu:SI (const_int 1) (const_int 0 [0]))))

which leads to incorrect results since LTU on MODE_CC register isn't the
same as "unsigned less than" in x86 backend.  To prevent RTL optimizers
from setting MODE_CC register to a constant, use UNSPEC_CC_NE to replace
ne:CCC/ne:CCO when setting FLAGS_REG in neg patterns.

gcc/

PR target/107172
* config/i386/i386.md (UNSPEC_CC_NE): New.
Replace ne:CCC/ne:CCO with UNSPEC_CC_NE in neg patterns.

gcc/testsuite/

PR target/107172
* gcc.target/i386/pr107172.c: New test.
---
 gcc/config/i386/i386.md  | 45 +---
 gcc/testsuite/gcc.target/i386/pr107172.c | 26 ++
 2 files changed, 51 insertions(+), 20 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107172.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index baf1f1f8fa2..aaa678e7314 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -113,6 +113,7 @@ (define_c_enum "unspec" [
   UNSPEC_PEEPSIB
   UNSPEC_INSN_FALSE_DEP
   UNSPEC_SBB
+  UNSPEC_CC_NE
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -11470,7 +11471,7 @@ (define_insn_and_split "*neg2_doubleword"
   "&& reload_completed"
   [(parallel
 [(set (reg:CCC FLAGS_REG)
- (ne:CCC (match_dup 1) (const_int 0)))
+ (unspec:CCC [(match_dup 1) (const_int 0)] UNSPEC_CC_NE))
  (set (match_dup 0) (neg:DWIH (match_dup 1)))])
(parallel
 [(set (match_dup 2)
@@ -11499,7 +11500,8 @@ (define_peephole2
(match_operand:SWI48 1 "nonimmediate_gr_operand"))
(parallel
 [(set (reg:CCC FLAGS_REG)
- (ne:CCC (match_operand:SWI48 2 "general_reg_operand") (const_int 0)))
+ (unspec:CCC [(match_operand:SWI48 2 "general_reg_operand")
+  (const_int 0)] UNSPEC_CC_NE))
  (set (match_dup 2) (neg:SWI48 (match_dup 2)))])
(parallel
 [(set (match_dup 0)
@@ -11517,7 +11519,7 @@ (define_peephole2
&& !reg_mentioned_p (operands[2], operands[1])"
   [(parallel
 [(set (reg:CCC FLAGS_REG)
- (ne:CCC (match_dup 2) (const_int 0)))
+ (unspec:CCC [(match_dup 2) (const_int 0)] UNSPEC_CC_NE))
  (set (match_dup 2) (neg:SWI48 (match_dup 2)))])
(parallel
 [(set (match_dup 0)
@@ -11543,7 +11545,8 @@ (define_peephole2
  (clobber (reg:CC FLAGS_REG))])
(parallel
 [(set (reg:CCC FLAGS_REG)
- (ne:CCC (match_operand:SWI48 1 "general_reg_operand") (const_int 0)))
+ (unspec:CCC [(match_operand:SWI48 1 "general_reg_operand")
+  (const_int 0)] UNSPEC_CC_NE))
  (set (match_dup 1) (neg:SWI48 (match_dup 1)))])
(parallel
 [(set (match_dup 0)
@@ -11559,7 +11562,7 @@ (define_peephole2
   "REGNO (operands[0]) != REGNO (operands[1])"
   [(parallel
 [(set (reg:CCC FLAGS_REG)
- (ne:CCC (match_dup 1) (const_int 0)))
+ (unspec:CCC [(match_dup 1) (const_int 0)] UNSPEC_CC_NE))
  (set (match_dup 1) (neg:SWI48 (match_dup 1)))])
(parallel
 [(set (match_dup 0)
@@ -11635,9 +11638,9 @@ (define_insn "*negsi_2_zext"
 
 (define_insn "*neg_ccc_1"
   [(set (reg:CCC FLAGS_REG)
-   (ne:CCC
- (match_operand:SWI 1 "nonimmediate_operand" "0")
- (const_int 0)))
+   (unspec:CCC
+ [(match_operand:SWI 1 "nonimmediate_operand" "0")
+  (const_int 0)] UNSPEC_CC_NE))
(set (match_operand:SWI 0 "nonimmediate_operand" "=m")
(neg:SWI (match_dup 1)))]
   ""
@@ -11647,9 +11650,9 @@ (define_insn "*neg_ccc_1"
 
 (define_insn "*neg_ccc_2"
   [(set (reg:CCC FLAGS_REG)
-   (ne:CCC
- (match_operand:SWI 1 "nonimmediate_operand" "0")
- (const_int 0)))
+   (unspec:CCC
+ [(match_operand:SWI 1 "nonimmediate_operand" "0")
+  (const_int 0)] UNSPEC_CC_NE))
(clobber (match_scratch:SWI 0 "="))]
   ""
   "neg{}\t%0"
@@ -11659,8 +11662,8 @@ (define_insn "*neg_ccc_2"
 (define_expand "x86_neg_ccc"
   [(parallel
 [(set (reg:CCC FLAGS_REG)
- (ne:CCC (match_operand:SWI48 1 "register_operand")
- (const_int 0)))
+ (unspec:CCC [(match_operand:SWI48 1 "register_operand")
+  (const_int 0)] UNSPEC_CC_NE))
  (set (match_operand:SWI48 0 "register_operand")
  (neg:SWI48 (match_dup 1)))])])
 
@@ -11686,8 +11689,9 @@ (define_insn "*negqi_ext_2"
 ;; Negate with jump on overflow.
 (define_expand "negv3"
   [(parallel [(set (reg:CCO FLAGS_REG)
-  (ne:CCO (match_operand:SWI 1 "register_operand")
-  

[PATCH v2] Always use TYPE_MODE instead of DECL_MODE for vector field

2022-10-24 Thread H.J. Lu via Gcc-patches
On Mon, Oct 24, 2022 at 12:12 AM Richard Biener
 wrote:
>
> On Fri, Oct 21, 2022 at 6:18 PM H.J. Lu  wrote:
> >
> > On Fri, Oct 21, 2022 at 2:33 AM Richard Biener
> >  wrote:
> > >
> > > On Thu, Oct 20, 2022 at 6:58 PM H.J. Lu via Gcc-patches
> > >  wrote:
> > > >
> > > > commit e034c5c895722e0092d2239cd8c2991db77d6d39
> > > > Author: Jakub Jelinek 
> > > > Date:   Sat Dec 2 08:54:47 2017 +0100
> > > >
> > > > PR target/78643
> > > > PR target/80583
> > > > * expr.c (get_inner_reference): If DECL_MODE of a non-bitfield
> > > > is BLKmode for vector field with vector raw mode, use TYPE_MODE
> > > > instead of DECL_MODE.
> > > >
> > > > fixed the case where DECL_MODE of a vector field is BLKmode and its
> > > > TYPE_MODE is a vector mode because of target attribute.  Remove the
> > > > BLKmode check for the case where DECL_MODE of a vector field is a vector
> > > > mode and its TYPE_MODE is BLKmode because of target attribute.
> > > >
> > > > gcc/
> > > >
> > > > PR target/107304
> > > > * expr.c (get_inner_reference): Always use TYPE_MODE for vector
> > > > field with vector raw mode.
> > > >
> > > > gcc/testsuite/
> > > >
> > > > PR target/107304
> > > > * gcc.target/i386/pr107304.c: New test.
> > > > ---
> > > >  gcc/expr.cc  |  3 +-
> > > >  gcc/testsuite/gcc.target/i386/pr107304.c | 39 
> > > >  2 files changed, 40 insertions(+), 2 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107304.c
> > > >
> > > > diff --git a/gcc/expr.cc b/gcc/expr.cc
> > > > index efe387e6173..9145193c2c1 100644
> > > > --- a/gcc/expr.cc
> > > > +++ b/gcc/expr.cc
> > > > @@ -7905,8 +7905,7 @@ get_inner_reference (tree exp, poly_int64_pod 
> > > > *pbitsize,
> > > >   /* For vector fields re-check the target flags, as DECL_MODE
> > > >  could have been set with different target flags than
> > > >  the current function has.  */
> > > > - if (mode == BLKmode
> > > > - && VECTOR_TYPE_P (TREE_TYPE (field))
> > > > + if (VECTOR_TYPE_P (TREE_TYPE (field))
> > > >   && VECTOR_MODE_P (TYPE_MODE_RAW (TREE_TYPE (field))))
> > >
> > > Isn't the check on TYPE_MODE_RAW also wrong then?  Btw, the mode could
> >
> > TYPE_MODE_RAW is always set to a vector mode for a vector type:
> >
> >/* Find an appropriate mode for the vector type.  */
> > if (TYPE_MODE (type) == VOIDmode)
> >   SET_TYPE_MODE (type,
> >  mode_for_vector (SCALAR_TYPE_MODE (innertype),
> >   nunits).else_blk ());
>
> But mode_for_vector can return a MODE_INT!

You are right.

>   /* For integers, try mapping it to a same-sized scalar mode.  */
>   if (GET_MODE_CLASS (innermode) == MODE_INT)
> {
>   poly_uint64 nbits = nunits * GET_MODE_BITSIZE (innermode);
>   if (int_mode_for_size (nbits, 0).exists ()
>   && have_regs_of_mode[mode])
> return mode;
>
> > But TYPE_MODE returns BLKmode if the vector mode is unsupported.
> >
> > > also be an integer mode.
> >
> > For a vector field, mode is either BLK mode or the vector mode.  Jakub,
> > can you comment on it?
>
> I think that for
>
> typedef int v2si __attribute__((vector_size(8)));
>
> struct X { int i; v2si j; };
>
> v2si should get DImode with -mno-sse?
>

Currently GCC generates

(insn 31 32 33 (set (subreg:DI (reg:V2SI 105) 0)
(reg:DI 84 [ _3 ])) "y2.c":12:11 -1
 (nil))

With my patch, v2si gets DImode directly without SUBREG.

Here is the v2 patch with the update commit message:

Remove the BLKmode check for the case where DECL_MODE
of a vector field is a vector mode and its TYPE_MODE isn't a
vector mode because of target attribute.

OK for master?

Thanks.

-- 
H.J.
From 25995549d541ac30f17d28d51d53483298fa74e2 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Wed, 19 Oct 2022 12:53:35 -0700
Subject: [PATCH v2] Always use TYPE_MODE instead of DECL_MODE for vector field

commit e034c5c895722e0092d2239cd8c2991db77d6d39
Author: Jakub Jelinek 
Date:   Sat Dec 2 08:54:47 2017 +0100

	PR target/7

Re: [PATCH] Always use TYPE_MODE instead of DECL_MODE for vector field

2022-10-21 Thread H.J. Lu via Gcc-patches
On Fri, Oct 21, 2022 at 2:33 AM Richard Biener
 wrote:
>
> On Thu, Oct 20, 2022 at 6:58 PM H.J. Lu via Gcc-patches
>  wrote:
> >
> > commit e034c5c895722e0092d2239cd8c2991db77d6d39
> > Author: Jakub Jelinek 
> > Date:   Sat Dec 2 08:54:47 2017 +0100
> >
> > PR target/78643
> > PR target/80583
> > * expr.c (get_inner_reference): If DECL_MODE of a non-bitfield
> > is BLKmode for vector field with vector raw mode, use TYPE_MODE
> > instead of DECL_MODE.
> >
> > fixed the case where DECL_MODE of a vector field is BLKmode and its
> > TYPE_MODE is a vector mode because of target attribute.  Remove the
> > BLKmode check for the case where DECL_MODE of a vector field is a vector
> > mode and its TYPE_MODE is BLKmode because of target attribute.
> >
> > gcc/
> >
> > PR target/107304
> > * expr.c (get_inner_reference): Always use TYPE_MODE for vector
> > field with vector raw mode.
> >
> > gcc/testsuite/
> >
> > PR target/107304
> > * gcc.target/i386/pr107304.c: New test.
> > ---
> >  gcc/expr.cc  |  3 +-
> >  gcc/testsuite/gcc.target/i386/pr107304.c | 39 
> >  2 files changed, 40 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107304.c
> >
> > diff --git a/gcc/expr.cc b/gcc/expr.cc
> > index efe387e6173..9145193c2c1 100644
> > --- a/gcc/expr.cc
> > +++ b/gcc/expr.cc
> > @@ -7905,8 +7905,7 @@ get_inner_reference (tree exp, poly_int64_pod 
> > *pbitsize,
> >   /* For vector fields re-check the target flags, as DECL_MODE
> >  could have been set with different target flags than
> >  the current function has.  */
> > - if (mode == BLKmode
> > - && VECTOR_TYPE_P (TREE_TYPE (field))
> > + if (VECTOR_TYPE_P (TREE_TYPE (field))
> >   && VECTOR_MODE_P (TYPE_MODE_RAW (TREE_TYPE (field))))
>
> Isn't the check on TYPE_MODE_RAW also wrong then?  Btw, the mode could

TYPE_MODE_RAW is always set to a vector mode for a vector type:

   /* Find an appropriate mode for the vector type.  */
if (TYPE_MODE (type) == VOIDmode)
  SET_TYPE_MODE (type,
 mode_for_vector (SCALAR_TYPE_MODE (innertype),
  nunits).else_blk ());

But TYPE_MODE returns BLKmode if the vector mode is unsupported.

> also be an integer mode.

For a vector field, mode is either BLK mode or the vector mode.  Jakub,
can you comment on it?

>
> > mode = TYPE_MODE (TREE_TYPE (field));
> > }
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107304.c 
> > b/gcc/testsuite/gcc.target/i386/pr107304.c
> > new file mode 100644
> > index 000..24d68795e7f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr107304.c
> > @@ -0,0 +1,39 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O0 -march=tigerlake" } */
> > +
> > +#include <stdint.h>
> > +
> > +typedef union {
> > +  uint8_t v __attribute__((aligned(256))) __attribute__ ((vector_size(64 * 
> > sizeof(uint8_t))));
> > +  uint8_t i[64] __attribute__((aligned(256)));
> > +} stress_vec_u8_64_t;
> > +
> > +typedef struct {
> > + struct {
> > +  stress_vec_u8_64_t s;
> > +  stress_vec_u8_64_t o;
> > +  stress_vec_u8_64_t mask1;
> > +  stress_vec_u8_64_t mask2;
> > + } u8_64;
> > +} stress_vec_data_t;
> > +
> > +__attribute__((target_clones("arch=alderlake", "default")))
> > +void
> > +stress_vecshuf_u8_64(stress_vec_data_t *data)
> > +{
> > +  stress_vec_u8_64_t *__restrict s;
> > +  stress_vec_u8_64_t *__restrict mask1;
> > +  stress_vec_u8_64_t *__restrict mask2;
> > +  register int i;
> > +
> > +  s = &data->u8_64.s;
> > +  mask1 = &data->u8_64.mask1;
> > +  mask2 = &data->u8_64.mask2;
> > +
> > +  for (i = 0; i < 256; i++) {  /* was i < 65536 */
> > +  stress_vec_u8_64_t tmp;
> > +
> > +  tmp.v = __builtin_shuffle(s->v, mask1->v);
> > +  s->v = __builtin_shuffle(tmp.v, mask2->v);
> > +  }
> > +}
> > --
> > 2.37.3
> >



-- 
H.J.


[PATCH] Always use TYPE_MODE instead of DECL_MODE for vector field

2022-10-20 Thread H.J. Lu via Gcc-patches
commit e034c5c895722e0092d2239cd8c2991db77d6d39
Author: Jakub Jelinek 
Date:   Sat Dec 2 08:54:47 2017 +0100

PR target/78643
PR target/80583
* expr.c (get_inner_reference): If DECL_MODE of a non-bitfield
is BLKmode for vector field with vector raw mode, use TYPE_MODE
instead of DECL_MODE.

fixed the case where DECL_MODE of a vector field is BLKmode and its
TYPE_MODE is a vector mode because of target attribute.  Remove the
BLKmode check for the case where DECL_MODE of a vector field is a vector
mode and its TYPE_MODE is BLKmode because of target attribute.

gcc/

PR target/107304
* expr.c (get_inner_reference): Always use TYPE_MODE for vector
field with vector raw mode.

gcc/testsuite/

PR target/107304
* gcc.target/i386/pr107304.c: New test.
---
 gcc/expr.cc  |  3 +-
 gcc/testsuite/gcc.target/i386/pr107304.c | 39 
 2 files changed, 40 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107304.c

diff --git a/gcc/expr.cc b/gcc/expr.cc
index efe387e6173..9145193c2c1 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -7905,8 +7905,7 @@ get_inner_reference (tree exp, poly_int64_pod *pbitsize,
  /* For vector fields re-check the target flags, as DECL_MODE
 could have been set with different target flags than
 the current function has.  */
- if (mode == BLKmode
- && VECTOR_TYPE_P (TREE_TYPE (field))
+ if (VECTOR_TYPE_P (TREE_TYPE (field))
  && VECTOR_MODE_P (TYPE_MODE_RAW (TREE_TYPE (field))))
mode = TYPE_MODE (TREE_TYPE (field));
}
diff --git a/gcc/testsuite/gcc.target/i386/pr107304.c 
b/gcc/testsuite/gcc.target/i386/pr107304.c
new file mode 100644
index 000..24d68795e7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107304.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -march=tigerlake" } */
+
+#include <stdint.h>
+
+typedef union {
+  uint8_t v __attribute__((aligned(256))) __attribute__ ((vector_size(64 * 
sizeof(uint8_t))));
+  uint8_t i[64] __attribute__((aligned(256)));
+} stress_vec_u8_64_t;
+
+typedef struct {
+ struct {
+  stress_vec_u8_64_t s;
+  stress_vec_u8_64_t o;
+  stress_vec_u8_64_t mask1;
+  stress_vec_u8_64_t mask2;
+ } u8_64;
+} stress_vec_data_t;
+
+__attribute__((target_clones("arch=alderlake", "default"))) 
+void
+stress_vecshuf_u8_64(stress_vec_data_t *data)
+{
+  stress_vec_u8_64_t *__restrict s;
+  stress_vec_u8_64_t *__restrict mask1;
+  stress_vec_u8_64_t *__restrict mask2;
+  register int i;
+
+  s = &data->u8_64.s;
+  mask1 = &data->u8_64.mask1;
+  mask2 = &data->u8_64.mask2;
+
+  for (i = 0; i < 256; i++) {  /* was i < 65536 */
+  stress_vec_u8_64_t tmp;
+
+  tmp.v = __builtin_shuffle(s->v, mask1->v);
+  s->v = __builtin_shuffle(tmp.v, mask2->v);
+  }
+}
-- 
2.37.3



Re: [PATCH 2/2] Support Intel prefetchit0/t1

2022-10-19 Thread H.J. Lu via Gcc-patches
On Fri, Oct 14, 2022 at 1:38 AM Haochen Jiang via Gcc-patches
 wrote:
>
> gcc/ChangeLog:
>
> * common/config/i386/cpuinfo.h (get_available_features):
> Detect PREFETCHI.
> * common/config/i386/i386-common.cc
> (OPTION_MASK_ISA2_PREFETCHI_SET,
> OPTION_MASK_ISA2_PREFETCHI_UNSET): New.
> (ix86_handle_option): Handle -mprefetchi.
> * common/config/i386/i386-cpuinfo.h (enum processor_features):
> Add FEATURE_PREFETCHI.
> * common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
> prefetchi.
> * config.gcc: Add prfchiintrin.h.
> * config/i386/cpuid.h (bit_PREFETCHI): New.
> * config/i386/i386-c.cc (ix86_target_macros_internal): Define
> __PREFETCHI__.
> * config/i386/i386-isa.def (PREFETCHI): Add DEF_PTA(PREFETCHI).
> * config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p):
> Handle prefetchi.
> * config/i386/i386.md (prefetch): Add handler for prefetchi
> (*prefetch_i): New define_insn.
> * config/i386/i386.opt: Add option -mprefetchi.
> * config/i386/immintrin.h: Include prfchiintrin.h.
> * config/i386/predicates.md (local_func_symbolic_operand):
> New predicates.
> * config/i386/xmmintrin.h (enum _mm_hint): New enum for prefetchi.
> (_mm_prefetch): Handle the highest bit of enum.
> * doc/extend.texi: Document prefetchi.
> * doc/invoke.texi: Document -mprefetchi.
> * doc/sourcebuild.texi: Document target prefetchi.
> * config/i386/prfchiintrin.h: New file.
>
> gcc/testsuite/ChangeLog:
>
> * g++.dg/other/i386-2.C: Add -mprefetchi.
> * g++.dg/other/i386-3.C: Ditto.
> * gcc.misc-tests/i386-pf-3dnow-1.c: Add scan-assembler-not for
> prefetchit0/t1.
> * gcc.misc-tests/i386-pf-athlon-1.c: Ditto.
> * gcc.misc-tests/i386-pf-sse-1.c: Ditto.
> * gcc.target/i386/avx-1.c: Add -mprefetchi.
> * gcc.target/i386/avx-2.c: Ditto.
> * gcc.target/i386/funcspec-56.inc: Add new target attribute.
> * gcc.target/i386/prefetchi-1.c: Rewrite testcase.
> * gcc.target/i386/prefetchi-2.c: New test.
> * gcc.target/i386/prefetchi-3.c: Ditto.
> * gcc.target/i386/sse-12.c: Add -mprefetchi.
> * gcc.target/i386/sse-13.c: Ditto.
> * gcc.target/i386/sse-14.c: Ditto.
> * gcc.target/i386/sse-22.c: Add prefetchi.
> * gcc.target/i386/sse-23.c: Ditto.
>
> Co-authored-by: Hongtao Liu 
> ---
>  gcc/common/config/i386/cpuinfo.h  |  2 +
>  gcc/common/config/i386/i386-common.cc | 15 
>  gcc/common/config/i386/i386-cpuinfo.h |  1 +
>  gcc/common/config/i386/i386-isas.h|  1 +
>  gcc/config.gcc|  2 +-
>  gcc/config/i386/cpuid.h   |  1 +
>  gcc/config/i386/i386-c.cc |  2 +
>  gcc/config/i386/i386-isa.def  |  1 +
>  gcc/config/i386/i386-options.cc   |  4 +-
>  gcc/config/i386/i386.md   | 90 +--
>  gcc/config/i386/i386.opt  |  4 +
>  gcc/config/i386/immintrin.h   |  2 +
>  gcc/config/i386/predicates.md | 15 
>  gcc/config/i386/prfchiintrin.h| 39 
>  gcc/config/i386/xmmintrin.h   |  6 +-
>  gcc/doc/extend.texi   |  5 ++
>  gcc/doc/invoke.texi   | 10 ++-
>  gcc/doc/sourcebuild.texi  |  3 +
>  gcc/testsuite/g++.dg/other/i386-2.C   |  2 +-
>  gcc/testsuite/g++.dg/other/i386-3.C   |  2 +-
>  .../gcc.misc-tests/i386-pf-3dnow-1.c  |  2 +
>  .../gcc.misc-tests/i386-pf-athlon-1.c |  2 +
>  gcc/testsuite/gcc.misc-tests/i386-pf-sse-1.c  |  2 +
>  gcc/testsuite/gcc.target/i386/avx-1.c |  2 +-
>  gcc/testsuite/gcc.target/i386/avx-2.c |  2 +-
>  gcc/testsuite/gcc.target/i386/funcspec-56.inc |  2 +
>  gcc/testsuite/gcc.target/i386/prefetchi-1.c   | 36 ++--
>  gcc/testsuite/gcc.target/i386/prefetchi-2.c   | 26 ++
>  gcc/testsuite/gcc.target/i386/prefetchi-3.c   | 15 
>  gcc/testsuite/gcc.target/i386/sse-12.c|  2 +-
>  gcc/testsuite/gcc.target/i386/sse-13.c|  2 +-
>  gcc/testsuite/gcc.target/i386/sse-14.c|  2 +-
>  gcc/testsuite/gcc.target/i386/sse-22.c|  4 +-
>  gcc/testsuite/gcc.target/i386/sse-23.c|  2 +-
>  34 files changed, 259 insertions(+), 49 deletions(-)
>  create mode 100644 gcc/config/i386/prfchiintrin.h
>  create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-3.c
>
> diff --git a/gcc/common/config/i386/cpuinfo.h 
> b/gcc/common/config/i386/cpuinfo.h
> index 118f3a42abd..551e0483330 100644
> --- a/gcc/common/config/i386/cpuinfo.h
> +++ b/gcc/common/config/i386/cpuinfo.h
> @@ -797,6 

Re: [PATCH] Canonicalize vec_perm index to make the first index come from the first vector.

2022-10-18 Thread H.J. Lu via Gcc-patches
On Tue, Oct 18, 2022 at 4:25 PM liuhongt  wrote:
>
> Fix unexpected non-canon form from gimple vector selector.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR target/107271
> * config/i386/i386-expand.cc (ix86_vec_perm_index_canon): New.
> (expand_vec_perm_shufps_shufps): Call
> ix86_vec_perm_index_canon
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr107271.c: New test.
> ---
>  gcc/config/i386/i386-expand.cc   | 17 +
>  gcc/testsuite/gcc.target/i386/pr107271.c | 16 
>  2 files changed, 33 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107271.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 6baff6d0e61..4f121516091 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -19604,6 +19604,22 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
>return false;
>  }
>
> +/* Canonicalize vec_perm index to make the first index
> +   always comes from the first index.  */
vector?
> +static void
> +ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
> +{
> +  unsigned nelt = d->nelt;
> +  if (d->perm[0] < nelt)
> +return;
> +
> +  for (unsigned i = 0; i != nelt; i++)
> +d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
> +
> +  std::swap (d->op0, d->op1);
> +  return;
> +}
> +
>  /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
> in terms of a pair of shufps+ shufps/pshufd instructions.  */
>  static bool
> @@ -19621,6 +19637,7 @@ expand_vec_perm_shufps_shufps (struct 
> expand_vec_perm_d *d)
>if (d->testing_p)
>  return true;
>
> +  ix86_vec_perm_index_canon (d);
>for (i = 0; i < 4; ++i)
>  count += d->perm[i] > 3 ? 1 : 0;
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr107271.c 
> b/gcc/testsuite/gcc.target/i386/pr107271.c
> new file mode 100644
> index 000..fe89c9a5bef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107271.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O0" } */
> +
> +typedef int __attribute__((__vector_size__ (16))) V;
> +
> +static inline __attribute__((__always_inline__)) V
> +bar (V v128u32_0)
> +{
> +  return __builtin_shuffle ((V){}, v128u32_0, v128u32_0);
> +}
> +
> +V
> +foo (void)
> +{
> +  return bar ((V){7, 4, 4});
> +}
> --
> 2.27.0
>


-- 
H.J.


Re: [COMMITTED 4/4] PR tree-optimization/102540 - propagate partial equivs in the cache.

2022-10-17 Thread H.J. Lu via Gcc-patches
On Thu, Oct 13, 2022 at 8:32 AM Andrew MacLeod via Gcc-patches
 wrote:
>
> Rangers on entry cache propagation already evaluates equivalences when
> calculating values. This patch also allows it to work with partial
> equivalences, and if the bit sizes are compatible, make use of those
> ranges as well.
>
> It attempts to be conservative, so should be safe.
>
> This resolves regressions in both PR 102540 and PR 102872.
>
> Bootstrapped on x86_64-pc-linux-gnu with no regressions.  Pushed
>
> Andrew

This caused:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107273

-- 
H.J.


Re: [PATCH] Add new target hook: simplify_modecc_const.

2022-10-14 Thread H.J. Lu via Gcc-patches
On Fri, Oct 14, 2022 at 1:32 PM Jeff Law via Gcc-patches
 wrote:
>
>
> On 10/10/22 09:50, H.J. Lu via Gcc-patches wrote:
> > On Thu, Jul 28, 2022 at 5:40 AM Richard Sandiford via Gcc-patches
> >  wrote:
> >> Seems this thread has become a bit heated, so I'll try to proceed
> >> with caution :-)
> >>
> >> In the below, I'll use "X-mode const_int" to mean "a const_int that
> >> is known from context to represent an X-mode value".  Of course,
> >> the const_int itself always stores VOIDmode.
> >>
> >> "Roger Sayle"  writes:
> >>> Hi Segher,
> >>> It's very important to distinguish the invariants that exist for the RTL
> >>> data structures as held in memory (rtx), vs. the use of "enum rtx_code"s,
> >>> "machine_mode"s and operands in the various processing functions
> >>> of the middle-end.
> >> FWIW, I agree this distinction is important, with the proviso (which
> >> I think you were also adding) that the code never loses track of what
> >> mode an rtx operand (stored in a variable) actually has/is being
> >> interpreted to have.
> >>
> >> In other words, the reason (zero_extend (const_int N)) is invalid is
> >> not that constant integers can't be extended in principle (of course
> >> they can).  It's invalid because we've lost track of how many bits
> >> that N actually has.  That problem doesn't apply in contexts where
> >> the operation is described using individual variables (rather than
> >> a single rtx) *provided that* one of those variables says what mode
> >> any potential const_ints actually represent.
> >>
> >>> Yes, it's very true that RTL integer constants don't specify a mode
> >>> (are VOIDmode), so therefore operations like ZERO_EXTEND or EQ
> >>> don't make sense with all constant operands.  This is (one reason)
> >>> why constant-only operands are disallowed from RTL (data structures),
> >>> and why in APIs that perform/simplify these operations, the original
> >>> operand mode (of the const_int(s)) must be/is always passed as a
> >>> parameter.
> >>>
> >>> Hence, for say simplify_const_binary_operation, op0 and op1 can
> >>> both be const_int, as the mode argument specifies the mode of the
> >>> "code" operation. Likewise, in simplify_relational_operation, both
> >>> op0 and op1 may be CONST_INT as "cmp_mode" explicitly specifies
> >>> the mode that the operation is performed in and "mode" specifies
> >>> the mode of the result.
> >> And the mode argument to simplify_const_relational_operation specifies
> >> the mode of the operands, not the mode of the result.  I.e. it specifies
> >> the modes of op0 and op1 rather than the mode that would be attached to
> >> the code in "(code:mode ...)" if an rtx were created with these parameters.
> >>
> >> That confused me when I saw the patch initially.  Elsewhere in the
> >> file "mode" tends to be the mode of the result, in cases where the
> >> mode of the result can be different from the modes of the operands,
> >> so using it for the mode of the operands seems a bit confusing
> >> (not your fault of course).
> >>
> >> I still struggle with the idea of having CC-mode const_ints though
> >> (using the meaning of "CC-mode const_ints" above).  I realise
> >> (compare (...) (const_int 0)) has been the norm "for ever", but here
> >> it feels like we're also blessing non-zero CC-mode const_ints.
> >> That raises the question of how many significant bits a CC-mode
> >> const_int actually has.  Currently:
> >>
> >>   ...  For historical reasons,
> >>   the size of a CC mode is four units.
> >>
> >> But treating CC-mode const_ints as having 32 significant bits is surely
> >> the wrong thing to do.
> >>
> >> So if we want to add more interpretation around CC modes, I think we
> >> should first clean up the representation to make the set of valid values
> >> more explicit.  (Preferably without reusing const_int for constant values,
> >> but that's probably a losing battle :-))
> >>
> >> Thanks,
> >> Richard
> > Here is a testcase to show that combine generates
> >
> > (set (reg:CCC 17 flags)
> > (ltu:SI (const_int 1 [1])
> >   (const_int 0 [0])))
> >
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107172
> >
> > This new target hook handles it properly
>
> And does it work if you reject MODE_CC comparisons with two constants in
> simplify_const_relational_operation?
>
>
> I suspect it will work, but generate suboptimal code.

It doesn't work for

(ltu:SI (const_int 1 [0x1]) (const_int 0 [0]))

simplified from

(ltu:SI (reg:CCC 17 flags) (const_int 0 [0]))

When simplify_const_relational_operation returns NULL for
MODE_CC comparison with two constants, combine will try
it again with VOIDmode comparison with two constants.

-- 
H.J.


Re: [PATCH][ICE] Fix for PR107193.

2022-10-10 Thread H.J. Lu via Gcc-patches
On Mon, Oct 10, 2022 at 5:37 PM Eugene Rozenfeld
 wrote:
>
> The bug was introduced in f30e9fd33e56a5a721346ea6140722e1b193db42.
> A variable (cur_locus_e) was incorrectly declared inside a loop.
> I also moved two other declarations (last and locus) down to make
> the code more clear.
>
> Tested on x86_64-pc-linux-gnu.
>
> gcc/ChangeLog:
> PR debug/107193
> * tree-cfg.cc (assign_discriminators): Move declaration of cur_locus_e
> out of the loop.
> ---
>  gcc/tree-cfg.cc | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
> index 41f2925665f..ae781871a19 100644
> --- a/gcc/tree-cfg.cc
> +++ b/gcc/tree-cfg.cc
> @@ -1204,9 +1204,8 @@ assign_discriminators (void)
>edge e;
>edge_iterator ei;
>gimple_stmt_iterator gsi;
> -  gimple *last = last_stmt (bb);
> -  location_t locus = last ? gimple_location (last) : UNKNOWN_LOCATION;
>location_t curr_locus = UNKNOWN_LOCATION;
> +  expanded_location curr_locus_e = {};
>int curr_discr = 0;
>
>/* Traverse the basic block, if two function calls within a basic block
> @@ -1215,7 +1214,7 @@ assign_discriminators (void)
>for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next ())
> {
>   gimple *stmt = gsi_stmt (gsi);
> - expanded_location curr_locus_e;
> +
>   if (curr_locus == UNKNOWN_LOCATION)
> {
>   curr_locus = gimple_location (stmt);
> @@ -1238,6 +1237,8 @@ assign_discriminators (void)
> curr_discr = next_discriminator_for_locus (curr_locus);
> }
>
> +  gimple *last = last_stmt (bb);
> +  location_t locus = last ? gimple_location (last) : UNKNOWN_LOCATION;
>if (locus == UNKNOWN_LOCATION)
> continue;
>
> --
> 2.25.1
>

It restored bootstrap for me.

Thanks.

-- 
H.J.


Re: [PATCH] Add new target hook: simplify_modecc_const.

2022-10-10 Thread H.J. Lu via Gcc-patches
On Thu, Jul 28, 2022 at 5:40 AM Richard Sandiford via Gcc-patches
 wrote:
>
> Seems this thread has become a bit heated, so I'll try to proceed
> with caution :-)
>
> In the below, I'll use "X-mode const_int" to mean "a const_int that
> is known from context to represent an X-mode value".  Of course,
> the const_int itself always stores VOIDmode.
>
> "Roger Sayle"  writes:
> > Hi Segher,
> > It's very important to distinguish the invariants that exist for the RTL
> > data structures as held in memory (rtx), vs. the use of "enum rtx_code"s,
> > "machine_mode"s and operands in the various processing functions
> > of the middle-end.
>
> FWIW, I agree this distinction is important, with the proviso (which
> I think you were also adding) that the code never loses track of what
> mode an rtx operand (stored in a variable) actually has/is being
> interpreted to have.
>
> In other words, the reason (zero_extend (const_int N)) is invalid is
> not that constant integers can't be extended in principle (of course
> they can).  It's invalid because we've lost track of how many bits
> that N actually has.  That problem doesn't apply in contexts where
> the operation is described using individual variables (rather than
> a single rtx) *provided that* one of those variables says what mode
> any potential const_ints actually represent.
>
> > Yes, it's very true that RTL integer constants don't specify a mode
> > (are VOIDmode), so therefore operations like ZERO_EXTEND or EQ
> > don't make sense with all constant operands.  This is (one reason)
> > why constant-only operands are disallowed from RTL (data structures),
> > and why in APIs that perform/simplify these operations, the original
> > operand mode (of the const_int(s)) must be/is always passed as a
> > parameter.
> >
> > Hence, for say simplify_const_binary_operation, op0 and op1 can
> > both be const_int, as the mode argument specifies the mode of the
> > "code" operation. Likewise, in simplify_relational_operation, both
> > op0 and op1 may be CONST_INT as "cmp_mode" explicitly specifies
> > the mode that the operation is performed in and "mode" specifies
> > the mode of the result.
>
> And the mode argument to simplify_const_relational_operation specifies
> the mode of the operands, not the mode of the result.  I.e. it specifies
> the modes of op0 and op1 rather than the mode that would be attached to
> the code in "(code:mode ...)" if an rtx were created with these parameters.
>
> That confused me when I saw the patch initially.  Elsewhere in the
> file "mode" tends to be the mode of the result, in cases where the
> mode of the result can be different from the modes of the operands,
> so using it for the mode of the operands seems a bit confusing
> (not your fault of course).
>
> I still struggle with the idea of having CC-mode const_ints though
> (using the meaning of "CC-mode const_ints" above).  I realise
> (compare (...) (const_int 0)) has been the norm "for ever", but here
> it feels like we're also blessing non-zero CC-mode const_ints.
> That raises the question of how many significant bits a CC-mode
> const_int actually has.  Currently:
>
>  ...  For historical reasons,
>  the size of a CC mode is four units.
>
> But treating CC-mode const_ints as having 32 significant bits is surely
> the wrong thing to do.
>
> So if we want to add more interpretation around CC modes, I think we
> should first clean up the representation to make the set of valid values
> more explicit.  (Preferably without reusing const_int for constant values,
> but that's probably a losing battle :-))
>
> Thanks,
> Richard

Here is a testcase to show that combine generates

(set (reg:CCC 17 flags)
   (ltu:SI (const_int 1 [1])
 (const_int 0 [0])))

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107172

This new target hook handles it properly.

-- 
H.J.


PING^1: [PATCH] x86: Check corrupted return address when unwinding stack

2022-10-04 Thread H.J. Lu via Gcc-patches
On Wed, Sep 21, 2022 at 1:42 PM H.J. Lu  wrote:
>
> If shadow stack is enabled, when unwinding stack, we count how many stack
> frames we pop to reach the landing pad and adjust shadow stack by the same
> amount.  When counting the stack frame, we compare the return address on
> normal stack against the return address on shadow stack.  If they don't
> match, return _URC_FATAL_PHASE2_ERROR for the corrupted return address on
> normal stack.  Don't check the return address for
>
> 1. Non-catchable exception where exception_class == 0.  Process will be
> terminated.
> 2. Zero return address which marks the outermost stack frame.
> 3. Signal stack frame since kernel puts a restore token on shadow stack.
>
> * unwind-generic.h (_Unwind_Frames_Increment): Add the EXC
> argument.
> * unwind.inc (_Unwind_RaiseException_Phase2): Pass EXC to
> _Unwind_Frames_Increment.
> (_Unwind_ForcedUnwind_Phase2): Likewise.
> * config/i386/shadow-stack-unwind.h (_Unwind_Frames_Increment):
> Take the EXC argument.  Return _URC_FATAL_PHASE2_ERROR if the
> return address on normal stack doesn't match the return address
> on shadow stack.
> ---
>  libgcc/config/i386/shadow-stack-unwind.h | 51 ++--
>  libgcc/unwind-generic.h  |  2 +-
>  libgcc/unwind.inc|  4 +-
>  3 files changed, 50 insertions(+), 7 deletions(-)
>
> diff --git a/libgcc/config/i386/shadow-stack-unwind.h 
> b/libgcc/config/i386/shadow-stack-unwind.h
> index 2b02682bdae..89d44165000 100644
> --- a/libgcc/config/i386/shadow-stack-unwind.h
> +++ b/libgcc/config/i386/shadow-stack-unwind.h
> @@ -54,10 +54,39 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
> If not, see
> aligned.  If the original shadow stack is 8 byte aligned, we just
> need to pop 2 slots, one restore token, from shadow stack.  Otherwise,
> we need to pop 3 slots, one restore token + 4 byte padding, from
> -   shadow stack.  */
> -#ifndef __x86_64__
> +   shadow stack.
> +
> +   When popping a stack frame, we compare the return address on normal
> +   stack against the return address on shadow stack.  If they don't match,
> +   return _URC_FATAL_PHASE2_ERROR for the corrupted return address on
> +   normal stack.  Don't check the return address for
> +   1. Non-catchable exception where exception_class == 0.  Process will
> +  be terminated.
> +   2. Zero return address which marks the outermost stack frame.
> +   3. Signal stack frame since kernel puts a restore token on shadow
> +  stack.
> + */
>  #undef _Unwind_Frames_Increment
> -#define _Unwind_Frames_Increment(context, frames)  \
> +#ifdef __x86_64__
> +#define _Unwind_Frames_Increment(exc, context, frames) \
> +{  \
> +  frames++;\
> +  if (exc->exception_class != 0\
> + && _Unwind_GetIP (context) != 0   \
> + && !_Unwind_IsSignalFrame (context))  \
> +   {   \
> + _Unwind_Word ssp = _get_ssp ();   \
> + if (ssp != 0) \
> +   {   \
> + ssp += 8 * frames;\
> + _Unwind_Word ra = *(_Unwind_Word *) ssp;  \
> + if (ra != _Unwind_GetIP (context))\
> +   return _URC_FATAL_PHASE2_ERROR; \
> +   }   \
> +   }   \
> +}
> +#else
> +#define _Unwind_Frames_Increment(exc, context, frames) \
>if (_Unwind_IsSignalFrame (context)) \
>  do \
>{\
> @@ -83,5 +112,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
> If not, see
>}\
>  while (0); \
>else \
> -frames++;
> +{  \
> +  frames++;\
> +  if (exc->exception_class != 0\
> + && _Unwind_GetIP (context) != 0)  \
> +   {   \
> + _Unwind_Word ssp = _get_ssp ();   \
> + if (ssp != 0) \
> +   {   \
> + ssp += 4 * frames;\
> + _Unwind_Word ra = *(_Unwind_Word *) ssp;  \
> + if (ra != _Unwind_GetIP (context))\
> +   return _URC_FATAL_PHASE2_ERROR; \
> + 

Re: [committed] Minor cleanup/prep in DOM

2022-09-30 Thread H.J. Lu via Gcc-patches
On Fri, Sep 30, 2022 at 4:06 PM Jeff Law  wrote:
>
>
> It's a bit weird that free_dom_edge_info leaves a dangling pointer in
> e->aux.  Not sure what I was thinking.
>
>
> There are two callers.  One wipes e->aux immediately after the call, the
> other attaches a newly created object immediately after the call.  So we
> can wipe e->aux within the call and simplify one of the two call sites.
>
> This is preparatory work for a minor optimization where we want to
> detect another class of edge equivalences in DOM (until something better
> is available) and either attach them an existing edge_info structure or
> create a new one if one doesn't currently exist for a given edge.
>
> Bootstrapped and regression tested on x86_64.  Installing on the trunk.
>

I got


/export/gnu/import/git/sources/gcc/gcc/tree-ssa-dom.cc: In function
‘void record_edge_info(basic_block)’:
/export/gnu/import/git/sources/gcc/gcc/tree-ssa-dom.cc:689:27: error:
‘dst’ was not declared in this scope; did you mean ‘dse’?
  689 |   if (dst == PHI_ARG_DEF (phi, !alternative))
  |   ^~~
  |   dse
In file included from /export/gnu/import/git/sources/gcc/gcc/gimple-ssa.h:24,
 from /export/gnu/import/git/sources/gcc/gcc/ssa.h:27,
 from /export/gnu/import/git/sources/gcc/gcc/tree-ssa-dom.cc:28:
/export/gnu/import/git/sources/gcc/gcc/tree-ssa-dom.cc:689:47: error:
‘phi’ was not declared in this scope; did you mean ‘gphi’?
  689 |   if (dst == PHI_ARG_DEF (phi, !alternative))
  |   ^~~
/export/gnu/import/git/sources/gcc/gcc/tree-ssa-operands.h:82:54:
note: in definition of macro ‘PHI_ARG_DEF’
   82 | #define PHI_ARG_DEF(PHI, I) gimple_phi_arg_def ((PHI), (I))
  |  ^~~
make: *** [Makefile:1146: tree-ssa-dom.o] Error 1

-- 
H.J.


Re: [PATCH] Document -fexcess-precision=16 in tm.texi

2022-09-30 Thread H.J. Lu via Gcc-patches
On Fri, Sep 30, 2022 at 3:25 PM Palmer Dabbelt  wrote:
>
> On Sat, 24 Sep 2022 19:13:36 PDT (-0700), san...@codesourcery.com wrote:
> > On 9/18/22 02:47, Palmer Dabbelt wrote:
> >> On Fri, 09 Sep 2022 02:46:40 PDT (-0700), Palmer Dabbelt wrote:
> >>> I just happened to stumble on this one while trying to sort out the
> >>> RISC-V bits.
> >>>
> >>> gcc/ChangeLog
> >>>
> >>> * doc/tm.texi (TARGET_C_EXCESS_PRECISION): Add 16.
> >>> ---
> >>>  gcc/doc/tm.texi | 2 +-
> >>>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>>
> >>> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> >>> index 858bfb80cec..7590924f2ca 100644
> >>> --- a/gcc/doc/tm.texi
> >>> +++ b/gcc/doc/tm.texi
> >>> @@ -1009,7 +1009,7 @@ of the excess precision explicitly added.  For
> >>>  @code{EXCESS_PRECISION_TYPE_FLOAT16}, and
> >>>  @code{EXCESS_PRECISION_TYPE_FAST}, the target should return the
> >>>  explicit excess precision that should be added depending on the
> >>> -value set for @option{-fexcess-precision=@r{[}standard@r{|}fast@r{]}}.
> >>> +value set for
> >>> @option{-fexcess-precision=@r{[}standard@r{|}fast@r{|}16@r{]}}.
> >>>  Note that unpredictable explicit excess precision does not make sense,
> >>>  so a target should never return @code{FLT_EVAL_METHOD_UNPREDICTABLE}
> >>>  when @var{type} is @code{EXCESS_PRECISION_TYPE_STANDARD},
> >>
> >> Just pinging this one as I'm not sure if it's OK to self-approve -- no
> >> rush on my end, I already figured it out so I don't need the
> >> documentation any more.
> >
> > This is fine, looks like a trivial correction.
>
> Thanks, committed.

tm.texi is a generated file.  I am checking in this patch to restore bootstrap.

-- 
H.J.
--
diff --git a/gcc/target.def b/gcc/target.def
index 4d49ffc2c88..a3d3b04a165 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -6181,7 +6181,7 @@ of the excess precision explicitly added.  For\n\
 @code{EXCESS_PRECISION_TYPE_FLOAT16}, and\n\
 @code{EXCESS_PRECISION_TYPE_FAST}, the target should return the\n\
 explicit excess precision that should be added depending on the\n\
-value set for @option{-fexcess-precision=@r{[}standard@r{|}fast@r{]}}.\n\
+value set for 
@option{-fexcess-precision=@r{[}standard@r{|}fast@r{|}16@r{]}}.\n\
 Note that unpredictable explicit excess precision does not make sense,\n\
 so a target should never return @code{FLT_EVAL_METHOD_UNPREDICTABLE}\n\
 when @var{type} is @code{EXCESS_PRECISION_TYPE_STANDARD},\n\


[PATCH] i386: Mark XMM4-XMM6 as clobbered by encodekey128/encodekey256

2022-09-27 Thread H.J. Lu via Gcc-patches
encodekey128 and encodekey256 operations clear XMM4-XMM6.  But it is
documented that XMM4-XMM6 are reserved for future usages and software
should not rely upon them being zeroed.  Change encodekey128 and
encodekey256 to clobber XMM4-XMM6.

gcc/

PR target/107061
* config/i386/predicates.md (encodekey128_operation): Check
XMM4-XMM6 as clobbered.
(encodekey256_operation): Likewise.
* config/i386/sse.md (encodekey128u32): Clobber XMM4-XMM6.
(encodekey256u32): Likewise.

gcc/testsuite/

PR target/107061
* gcc.target/i386/keylocker-encodekey128.c: Don't check
XMM4-XMM6.
* gcc.target/i386/keylocker-encodekey256.c: Likewise.
---
 gcc/config/i386/predicates.md | 20 +--
 gcc/config/i386/sse.md|  4 ++--
 .../gcc.target/i386/keylocker-encodekey128.c  |  1 -
 .../gcc.target/i386/keylocker-encodekey256.c  |  1 -
 4 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 655eabf793b..c4141a96735 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -2107,11 +2107,11 @@ (define_predicate "encodekey128_operation"
   for(i = 4; i < 7; i++)
 {
   elt = XVECEXP (op, 0, i);
-  if (GET_CODE (elt) != SET
- || GET_CODE (SET_DEST (elt)) != REG
- || GET_MODE (SET_DEST (elt)) != V2DImode
- || REGNO (SET_DEST (elt)) != GET_SSE_REGNO (i)
- || SET_SRC (elt) != CONST0_RTX (V2DImode))
+  if (GET_CODE (elt) != CLOBBER
+ || GET_MODE (elt) != VOIDmode
+ || GET_CODE (XEXP (elt, 0)) != REG
+ || GET_MODE (XEXP (elt, 0)) != V2DImode
+ || REGNO (XEXP (elt, 0)) != GET_SSE_REGNO (i))
return false;
 }
 
@@ -2157,11 +2157,11 @@ (define_predicate "encodekey256_operation"
   for(i = 4; i < 7; i++)
 {
   elt = XVECEXP (op, 0, i + 1);
-  if (GET_CODE (elt) != SET
- || GET_CODE (SET_DEST (elt)) != REG
- || GET_MODE (SET_DEST (elt)) != V2DImode
- || REGNO (SET_DEST (elt)) != GET_SSE_REGNO (i)
- || SET_SRC (elt) != CONST0_RTX (V2DImode))
+  if (GET_CODE (elt) != CLOBBER
+ || GET_MODE (elt) != VOIDmode
+ || GET_CODE (XEXP (elt, 0)) != REG
+ || GET_MODE (XEXP (elt, 0)) != V2DImode
+ || REGNO (XEXP (elt, 0)) != GET_SSE_REGNO (i))
return false;
 }
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5c189635124..076064f97e6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -29015,7 +29015,7 @@ (define_expand "encodekey128u32"
 
   for (i = 4; i < 7; i++)
 XVECEXP (operands[2], 0, i)
-  = gen_rtx_SET (xmm_regs[i], CONST0_RTX (V2DImode));
+  = gen_rtx_CLOBBER (VOIDmode, xmm_regs[i]);
 
   XVECEXP (operands[2], 0, 7)
 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
@@ -29072,7 +29072,7 @@ (define_expand "encodekey256u32"
 
   for (i = 4; i < 7; i++)
 XVECEXP (operands[2], 0, i + 1)
-  = gen_rtx_SET (xmm_regs[i], CONST0_RTX (V2DImode));
+  = gen_rtx_CLOBBER (VOIDmode, xmm_regs[i]);
 
   XVECEXP (operands[2], 0, 8)
 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-encodekey128.c 
b/gcc/testsuite/gcc.target/i386/keylocker-encodekey128.c
index 805e0628673..57fa9bdc831 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-encodekey128.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-encodekey128.c
@@ -6,7 +6,6 @@
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
\\t\]+\[^\\n\]*%xmm0,\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
\\t\]+\[^\\n\]*%xmm1,\[^\\n\\r\]*16\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
\\t\]+\[^\\n\]*%xmm2,\[^\\n\\r\]*32\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "(?:movdqa|movaps)\[ 
\\t\]+\[^\\n\]*%xmm\[4-6\],\[^\\n\\r\]*" } } */
 
 #include 
 
diff --git a/gcc/testsuite/gcc.target/i386/keylocker-encodekey256.c 
b/gcc/testsuite/gcc.target/i386/keylocker-encodekey256.c
index 26f04dcf014..a9398b4e7a2 100644
--- a/gcc/testsuite/gcc.target/i386/keylocker-encodekey256.c
+++ b/gcc/testsuite/gcc.target/i386/keylocker-encodekey256.c
@@ -8,7 +8,6 @@
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
\\t\]+\[^\\n\]*%xmm1,\[^\\n\\r\]*16\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
\\t\]+\[^\\n\]*%xmm2,\[^\\n\\r\]*32\[^\\n\\r\]*" } } */
 /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
\\t\]+\[^\\n\]*%xmm3,\[^\\n\\r\]*48\[^\\n\\r\]*" } } */
-/* { dg-final { scan-assembler "(?:movdqa|movaps)\[ 
\\t\]+\[^\\n\]*%xmm\[4-6\],\[^\\n\\r\]*" } } */
 
 #include 
 
-- 
2.37.3



Re: [RFC] postreload cse'ing vector constants

2022-09-27 Thread H.J. Lu via Gcc-patches
On Tue, Sep 27, 2022 at 10:46 AM Robin Dapp via Gcc-patches
 wrote:
>
> > I did bootstrapping and ran the testsuite on x86(-64), aarch64, Power9
> > and s390.  Everything looks good except two additional fails on x86
> > where code actually looks worse.
> >
> > gcc.target/i386/keylocker-encodekey128.c
> >
> > 17c17,18
> > <   movaps  %xmm4, k2(%rip)
> > ---
> >>   pxor%xmm0, %xmm0
> >>   movaps  %xmm0, k2(%rip)
> >
> > gcc.target/i386/keylocker-encodekey256.c:
> >
> > 19c19,20
> > <   movaps  %xmm4, k3(%rip)
> > ---
> >>   pxor%xmm0, %xmm0
> >>   movaps  %xmm0, k3(%rip)
>
> Before the patch and after postreload we have:
>
> (insn (set (reg:V2DI xmm0)
> (reg:V2DI xmm4))
>  (expr_list:REG_DEAD (reg:V2DI 24 xmm4)
> (expr_list:REG_EQUIV (const_vector:V2DI [
> (const_int 0 [0]) repeated x2
> ])
> (insn (set (mem/c:V2DI (symbol_ref:DI ("k2"))
> (reg:V2DI xmm0
>
> which is converted by cprop_hardreg to:
>
> (insn (set (mem/c:V2DI (symbol_ref:DI ("k2")))
> (reg:V2DI xmm4
>
> With the change there is:
>
> (insn (set (reg:V2DI xmm0)
> (const_vector:V2DI [
> (const_int 0 [0]) repeated x2
> ])))
> (insn (set (mem/c:V2DI (symbol_ref:DI ("k2")))
> (reg:V2DI xmm0
>
> which is not simplified further because xmm0 needs to be explicitly
> zeroed while xmm4 is assumed to be zeroed by encodekey128.  I'm not
> familiar with this so I'm supposing this is correct even though I found
> "XMM4 through XMM6 are reserved for future usages and software should
> not rely upon them being zeroed." online.

I opened:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107061

> Even if xmm4 were zeroed explicitly, I guess in this case the simple
> costing of mov reg,reg vs mov reg,imm (with the latter not being more
> expensive) falls short?  cprop_hardreg can actually propagate the zeroed
> xmm4 into the next move.
> The same mechanism could possibly even elide many such moves which would
> mean we'd unnecessarily emit many mov reg,0?  Hmm...

This sounds like an issue.

-- 
H.J.


Re: [PATCH] Ignore debug insns with CONCAT and CONCATN for insn scheduling

2022-09-26 Thread H.J. Lu via Gcc-patches
On Sat, Sep 24, 2022 at 1:37 PM Jeff Law  wrote:
>
>
> On 9/21/22 16:11, H.J. Lu wrote:
> > On Wed, Sep 7, 2022 at 10:03 AM Jeff Law via Gcc-patches
> >  wrote:
> >>
> >>
> >> On 9/2/2022 8:36 AM, H.J. Lu via Gcc-patches wrote:
> >>> CONCAT and CONCATN never appear in the insn chain.  They are only used
> >>> in debug insn.  Ignore debug insns with CONCAT and CONCATN for insn
> >>> scheduling to avoid different insn orders with and without debug insn.
> >>>
> >>> gcc/
> >>>
> >>>PR rtl-optimization/106746
> >>>* sched-deps.cc (sched_analyze_2): Ignore debug insns with CONCAT
> >>>and CONCATN.
> >> Shouldn't we be ignoring everything in a debug insn?   I don't see why
> >> CONCAT/CONCATN are special here.
> > Debug insns are processed by insn scheduling.   I think it is to improve 
> > debug
> > experiences.  It is just that there are no matching usages of CONCAT/CONCATN
> > in non-debug insns.
>
> But from a dependency standpoint ISTM all debug insn can be ignored.  I
> still don't see why concat/concatn should be special here.
>

I tried to ignore everything in a debug insn.  It caused many regressions in
the GCC testsuite.

-- 
H.J.


Re: [PATCH] Ignore debug insns with CONCAT and CONCATN for insn scheduling

2022-09-21 Thread H.J. Lu via Gcc-patches
On Wed, Sep 7, 2022 at 10:03 AM Jeff Law via Gcc-patches
 wrote:
>
>
>
> On 9/2/2022 8:36 AM, H.J. Lu via Gcc-patches wrote:
> > CONCAT and CONCATN never appear in the insn chain.  They are only used
> > in debug insn.  Ignore debug insns with CONCAT and CONCATN for insn
> > scheduling to avoid different insn orders with and without debug insn.
> >
> > gcc/
> >
> >   PR rtl-optimization/106746
> >   * sched-deps.cc (sched_analyze_2): Ignore debug insns with CONCAT
> >   and CONCATN.
> Shouldn't we be ignoring everything in a debug insn?   I don't see why
> CONCAT/CONCATN are special here.

Debug insns are processed by insn scheduling.   I think it is to improve debug
experiences.  It is just that there are no matching usages of CONCAT/CONCATN
in non-debug insns.

--
H.J.


[PATCH] x86: Check corrupted return address when unwinding stack

2022-09-21 Thread H.J. Lu via Gcc-patches
If shadow stack is enabled, when unwinding stack, we count how many stack
frames we pop to reach the landing pad and adjust shadow stack by the same
amount.  When counting the stack frame, we compare the return address on
normal stack against the return address on shadow stack.  If they don't
match, return _URC_FATAL_PHASE2_ERROR for the corrupted return address on
normal stack.  Don't check the return address for

1. Non-catchable exception where exception_class == 0.  Process will be
terminated.
2. Zero return address which marks the outermost stack frame.
3. Signal stack frame since kernel puts a restore token on shadow stack.

* unwind-generic.h (_Unwind_Frames_Increment): Add the EXC
argument.
* unwind.inc (_Unwind_RaiseException_Phase2): Pass EXC to
_Unwind_Frames_Increment.
(_Unwind_ForcedUnwind_Phase2): Likewise.
* config/i386/shadow-stack-unwind.h (_Unwind_Frames_Increment):
Take the EXC argument.  Return _URC_FATAL_PHASE2_ERROR if the
return address on normal stack doesn't match the return address
on shadow stack.
---
 libgcc/config/i386/shadow-stack-unwind.h | 51 ++--
 libgcc/unwind-generic.h  |  2 +-
 libgcc/unwind.inc|  4 +-
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/libgcc/config/i386/shadow-stack-unwind.h 
b/libgcc/config/i386/shadow-stack-unwind.h
index 2b02682bdae..89d44165000 100644
--- a/libgcc/config/i386/shadow-stack-unwind.h
+++ b/libgcc/config/i386/shadow-stack-unwind.h
@@ -54,10 +54,39 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
aligned.  If the original shadow stack is 8 byte aligned, we just
need to pop 2 slots, one restore token, from shadow stack.  Otherwise,
we need to pop 3 slots, one restore token + 4 byte padding, from
-   shadow stack.  */
-#ifndef __x86_64__
+   shadow stack.
+
+   When popping a stack frame, we compare the return address on normal
+   stack against the return address on shadow stack.  If they don't match,
+   return _URC_FATAL_PHASE2_ERROR for the corrupted return address on
+   normal stack.  Don't check the return address for
+   1. Non-catchable exception where exception_class == 0.  Process will
+  be terminated.
+   2. Zero return address which marks the outermost stack frame.
+   3. Signal stack frame since kernel puts a restore token on shadow
+  stack.
+ */
 #undef _Unwind_Frames_Increment
-#define _Unwind_Frames_Increment(context, frames)  \
+#ifdef __x86_64__
+#define _Unwind_Frames_Increment(exc, context, frames) \
+{  \
+  frames++;\
+  if (exc->exception_class != 0\
+ && _Unwind_GetIP (context) != 0   \
+ && !_Unwind_IsSignalFrame (context))  \
+   {   \
+ _Unwind_Word ssp = _get_ssp ();   \
+ if (ssp != 0) \
+   {   \
+ ssp += 8 * frames;\
+ _Unwind_Word ra = *(_Unwind_Word *) ssp;  \
+ if (ra != _Unwind_GetIP (context))\
+   return _URC_FATAL_PHASE2_ERROR; \
+   }   \
+   }   \
+}
+#else
+#define _Unwind_Frames_Increment(exc, context, frames) \
   if (_Unwind_IsSignalFrame (context)) \
 do \
   {\
@@ -83,5 +112,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
   }\
 while (0); \
   else \
-frames++;
+{  \
+  frames++;\
+  if (exc->exception_class != 0\
+ && _Unwind_GetIP (context) != 0)  \
+   {   \
+ _Unwind_Word ssp = _get_ssp ();   \
+ if (ssp != 0) \
+   {   \
+ ssp += 4 * frames;\
+ _Unwind_Word ra = *(_Unwind_Word *) ssp;  \
+ if (ra != _Unwind_GetIP (context))\
+   return _URC_FATAL_PHASE2_ERROR; \
+   }   \
+   }   \
+}
 #endif
diff --git a/libgcc/unwind-generic.h b/libgcc/unwind-generic.h
index a87c9b3ccf6..bf721282d03 100644
--- 

[PATCH] Ignore debug insns with CONCAT and CONCATN for insn scheduling

2022-09-02 Thread H.J. Lu via Gcc-patches
CONCAT and CONCATN never appear in the insn chain.  They are only used
in debug insn.  Ignore debug insns with CONCAT and CONCATN for insn
scheduling to avoid different insn orders with and without debug insn.

gcc/

PR rtl-optimization/106746
* sched-deps.cc (sched_analyze_2): Ignore debug insns with CONCAT
and CONCATN.

gcc/testsuite/

PR rtl-optimization/106746
* gcc.dg/pr106746.c: New test.
---
 gcc/sched-deps.cc   | 14 ++
 gcc/testsuite/gcc.dg/pr106746.c | 30 ++
 2 files changed, 44 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/pr106746.c

diff --git a/gcc/sched-deps.cc b/gcc/sched-deps.cc
index 948aa0c3b60..b472e4fbb09 100644
--- a/gcc/sched-deps.cc
+++ b/gcc/sched-deps.cc
@@ -2794,6 +2794,20 @@ sched_analyze_2 (class deps_desc *deps, rtx x, rtx_insn 
*insn)
 
   return;
 
+case VAR_LOCATION:
+  if (GET_CODE (PAT_VAR_LOCATION_LOC (x)) == CONCAT
+ || GET_CODE (PAT_VAR_LOCATION_LOC (x)) == CONCATN)
+   {
+ /* CONCAT and CONCATN never appear in the insn chain.  They
+are only used in debug insn.  Ignore insns with CONCAT and
+CONCATN for insn scheduling to avoid different insn orders
+with and without debug insn.  */
+ if (cslr_p && sched_deps_info->finish_rhs)
+   sched_deps_info->finish_rhs ();
+ return;
+   }
+  break;
+
 default:
   break;
 }
diff --git a/gcc/testsuite/gcc.dg/pr106746.c b/gcc/testsuite/gcc.dg/pr106746.c
new file mode 100644
index 000..1fc29de28c3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr106746.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-Wno-psabi -O2 -fsched2-use-superblocks -fcompare-debug" } */
+typedef char __attribute__((__vector_size__ (64))) U;
+typedef short __attribute__((__vector_size__ (64))) V;
+typedef int __attribute__((__vector_size__ (64))) W;
+
+char c;
+U a;
+U *r;
+W foo0_v512u32_0;
+
+void
+foo (W)
+{
+  U u;
+  V v;
+  W w = __builtin_shuffle (foo0_v512u32_0, foo0_v512u32_0);
+  u = __builtin_shufflevector (a, u, 3, 0, 4, 9, 9, 6,
+  7, 8, 5, 0, 6, 1, 8, 1,
+  2, 8, 6, 1, 8, 4, 9, 3,
+  8, 4, 6, 0, 9, 0, 1, 8,
+  2, 3, 3, 0, 4, 9, 9, 6,
+  7, 8, 5, 0, 6, 1, 8, 1,
+  2, 8, 6, 1, 8, 4, 9, 3,
+  8, 4, 6, 0, 9, 0, 1, 8,
+  2, 3);
+  v *= c;
+  w &= c;
+  *r = (U) v + (U) w;
+}
-- 
2.37.2



Re: [PATCH] i386: Fix conversion of move to/from AX_REG into xchg [PR106707]

2022-09-01 Thread H.J. Lu via Gcc-patches
On Thu, Sep 1, 2022 at 11:23 AM Uros Bizjak via Gcc-patches
 wrote:
>
> The conversion of a move pattern where both operands are AX_REG
> should be prevented.
>
> 2022-09-01  Uroš Bizjak  
>
> gcc/ChangeLog:
>
> PR target/106707
> * config/i386/i386.md (moves to/from AX_REG into xchg peephole2):
> Do not convert a move pattern where both operands are AX_REG.
>
> gcc/testsuite/ChangeLog:
>
> PR target/106707
> * gcc.target/i386/pr106707.c: New test.
>
> Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
>
> Pushed to master.
>
> Uros.

I am checking this in to replace long with long long for 64-bit integers.

-- 
H.J.
From 01ca233f7a8ab683968d1ae2eb6e9f1049e86ad2 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Thu, 1 Sep 2022 15:18:18 -0700
Subject: [PATCH] i386: Replace long with long long for 64-bit integer

Replace long with long long for 64-bit integer since long may be 32
bits.

	PR target/106707
	* gcc.target/i386/pr106707.c (foo): Replace long with long long.
---
 gcc/testsuite/gcc.target/i386/pr106707.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr106707.c b/gcc/testsuite/gcc.target/i386/pr106707.c
index a127ccd4679..2e8ebaafb33 100644
--- a/gcc/testsuite/gcc.target/i386/pr106707.c
+++ b/gcc/testsuite/gcc.target/i386/pr106707.c
@@ -10,9 +10,9 @@ unsigned x, y;
 V v;
 
 void
-foo (long a)
+foo (long long a)
 {
-  long l = a != x;
+  long long l = a != x;
   int i = __builtin_add_overflow_p (y * ii, 0, 0);
   V u = ii < x | v, w = x <= u < i & y <= x / ii;
   v = __builtin_shufflevector (v, w, 1, 2) + (V) l;
-- 
2.37.2



[PATCH] x86: Handle V16BF in ix86_avx256_split_vector_move_misalign

2022-08-26 Thread H.J. Lu via Gcc-patches
Handle E_V16BFmode in ix86_avx256_split_vector_move_misalign and add
V16BF to V_256H iterator.

gcc/

PR target/106748
* config/i386/i386-expand.cc
(ix86_avx256_split_vector_move_misalign): Handle E_V16BFmode.
* config/i386/sse.md (V_256H): Add V16BF.

gcc/testsuite/

PR target/106748
* gcc.target/i386/pr106748.c: New test.
---
 gcc/config/i386/i386-expand.cc   |  4 
 gcc/config/i386/sse.md   |  4 ++--
 gcc/testsuite/gcc.target/i386/pr106748.c | 20 
 3 files changed, 26 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106748.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4b216308a18..836ebc82d67 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -745,6 +745,10 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
   extract = gen_avx_vextractf128v32qi;
   mode = V16QImode;
   break;
+case E_V16BFmode:
+  extract = gen_avx_vextractf128v16bf;
+  mode = V8BFmode;
+  break;
 case E_V16HFmode:
   extract = gen_avx_vextractf128v16hf;
   mode = V8HFmode;
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e6ab3c92dcf..259048481b6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -297,9 +297,9 @@ (define_mode_iterator V_128
 (define_mode_iterator V_256
   [V32QI V16HI V8SI V4DI V8SF V4DF])
 
-;; All 256bit vector modes including HF vector mode
+;; All 256bit vector modes including HF/BF vector modes
 (define_mode_iterator V_256H
-  [V32QI V16HI V8SI V4DI V8SF V4DF V16HF])
+  [V32QI V16HI V8SI V4DI V8SF V4DF V16HF V16BF])
 
 ;; All 128bit and 256bit vector modes
 (define_mode_iterator V_128_256
diff --git a/gcc/testsuite/gcc.target/i386/pr106748.c 
b/gcc/testsuite/gcc.target/i386/pr106748.c
new file mode 100644
index 000..6388b1deb23
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106748.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx256-split-unaligned-store -mavx -fpack-struct" } */
+
+typedef __bf16 __m256bf16 __attribute__((__vector_size__(32)));
+typedef struct {
+  __m256bf16 _m256bf16[1];
+} YMM_T;
+
+struct {
+  YMM_T ymm0;
+} fregs;
+
+__m256bf16 do_test_u3b_0_0;
+int do_test_i;
+
+void
+do_test()
+{
+  (&fregs.ymm0)[do_test_i]._m256bf16[0] = do_test_u3b_0_0;
+}
-- 
2.37.2



Re: [PATCH] x86: Cast stride to __PTRDIFF_TYPE__ in AMX intrinsics

2022-08-26 Thread H.J. Lu via Gcc-patches
On Mon, Aug 22, 2022 at 7:05 PM Hongtao Liu  wrote:
>
> On Tue, Aug 23, 2022 at 1:02 AM H.J. Lu  wrote:
> >
> > On 64-bit Windows, long is 32 bits and can't be used as stride in memory
> > operand when base is a pointer which is 64 bits.  Cast stride to
> > __PTRDIFF_TYPE__, instead of long.
> Ok.
> >
> > PR target/106714
> > * config/i386/amxtileintrin.h (_tile_loadd_internal): Cast to
> > __PTRDIFF_TYPE__.
> > (_tile_stream_loadd_internal): Likewise.
> > (_tile_stored_internal): Likewise.
> > ---
> >  gcc/config/i386/amxtileintrin.h | 6 +++---
> >  1 file changed, 3 insertions(+), 3 deletions(-)
> >
> > diff --git a/gcc/config/i386/amxtileintrin.h 
> > b/gcc/config/i386/amxtileintrin.h
> > index 7b5a39eba72..06f18aa9bfb 100644
> > --- a/gcc/config/i386/amxtileintrin.h
> > +++ b/gcc/config/i386/amxtileintrin.h
> > @@ -62,7 +62,7 @@ _tile_release (void)
> >  #define _tile_loadd_internal(dst,base,stride)  \
> >__asm__ volatile \
> >("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" 
> > \
> > -   :: "r" ((const void*) (base)), "r" ((long) (stride)))
> > +   :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
> >
> >  #define _tile_stream_loadd(dst,base,stride)\
> >_tile_stream_loadd_internal (dst, base, stride)
> > @@ -70,7 +70,7 @@ _tile_release (void)
> >  #define _tile_stream_loadd_internal(dst,base,stride)   \
> >__asm__ volatile \
> >("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", 
> > [%0+%1*1]}" \
> > -   :: "r" ((const void*) (base)), "r" ((long) (stride)))
> > +   :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
> >
> >  #define _tile_stored(dst,base,stride)  \
> >_tile_stored_internal (dst, base, stride)
> > @@ -78,7 +78,7 @@ _tile_release (void)
> >  #define _tile_stored_internal(src,base,stride) \
> >__asm__ volatile \
> >("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], 
> > %%tmm"#src"}" \
> > -   :: "r" ((void*) (base)), "r" ((long) (stride)) \
> > +   :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)) \
> > : "memory")
> >
> >  #define _tile_zero(dst)\
> > --
> > 2.37.2
> >
>
>
> --
> BR,
> Hongtao

OK to backport it to GCC 12 branch?


-- 
H.J.


[PATCH] x86: Replace vmovdqu with movdqu in BF16 XMM ABI tests

2022-08-23 Thread H.J. Lu via Gcc-patches
I am checking this in as an obvious fix.

H.J.
---
Since XMM BF16 tests only require SSE2, replace vmovdqu with movdqu in
BF16 XMM ABI tests to support SSE2 machines without AVX.

Tested on x86-64 machines with and without AVX.

* gcc.target/x86_64/abi/bf16/asm-support.S: Replace vmovdqu with
movdqu.
---
 .../gcc.target/x86_64/abi/bf16/asm-support.S  | 36 +--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/gcc/testsuite/gcc.target/x86_64/abi/bf16/asm-support.S 
b/gcc/testsuite/gcc.target/x86_64/abi/bf16/asm-support.S
index a8165d86317..7559aa910c4 100644
--- a/gcc/testsuite/gcc.target/x86_64/abi/bf16/asm-support.S
+++ b/gcc/testsuite/gcc.target/x86_64/abi/bf16/asm-support.S
@@ -20,22 +20,22 @@ snapshot:
movq%r13, r13(%rip)
movq%r14, r14(%rip)
movq%r15, r15(%rip)
-   vmovdqu %xmm0, xmm_regs+0(%rip)
-   vmovdqu %xmm1, xmm_regs+16(%rip)
-   vmovdqu %xmm2, xmm_regs+32(%rip)
-   vmovdqu %xmm3, xmm_regs+48(%rip)
-   vmovdqu %xmm4, xmm_regs+64(%rip)
-   vmovdqu %xmm5, xmm_regs+80(%rip)
-   vmovdqu %xmm6, xmm_regs+96(%rip)
-   vmovdqu %xmm7, xmm_regs+112(%rip)
-   vmovdqu %xmm8, xmm_regs+128(%rip)
-   vmovdqu %xmm9, xmm_regs+144(%rip)
-   vmovdqu %xmm10, xmm_regs+160(%rip)
-   vmovdqu %xmm11, xmm_regs+176(%rip)
-   vmovdqu %xmm12, xmm_regs+192(%rip)
-   vmovdqu %xmm13, xmm_regs+208(%rip)
-   vmovdqu %xmm14, xmm_regs+224(%rip)
-   vmovdqu %xmm15, xmm_regs+240(%rip)
+   movdqu  %xmm0, xmm_regs+0(%rip)
+   movdqu  %xmm1, xmm_regs+16(%rip)
+   movdqu  %xmm2, xmm_regs+32(%rip)
+   movdqu  %xmm3, xmm_regs+48(%rip)
+   movdqu  %xmm4, xmm_regs+64(%rip)
+   movdqu  %xmm5, xmm_regs+80(%rip)
+   movdqu  %xmm6, xmm_regs+96(%rip)
+   movdqu  %xmm7, xmm_regs+112(%rip)
+   movdqu  %xmm8, xmm_regs+128(%rip)
+   movdqu  %xmm9, xmm_regs+144(%rip)
+   movdqu  %xmm10, xmm_regs+160(%rip)
+   movdqu  %xmm11, xmm_regs+176(%rip)
+   movdqu  %xmm12, xmm_regs+192(%rip)
+   movdqu  %xmm13, xmm_regs+208(%rip)
+   movdqu  %xmm14, xmm_regs+224(%rip)
+   movdqu  %xmm15, xmm_regs+240(%rip)
jmp *callthis(%rip)
 .LFE3:
.size   snapshot, .-snapshot
@@ -50,8 +50,8 @@ snapshot_ret:
addq$8, %rsp
movq%rax, rax(%rip)
movq%rdx, rdx(%rip)
-   vmovdqu %xmm0, xmm_regs+0(%rip)
-   vmovdqu %xmm1, xmm_regs+16(%rip)
+   movdqu  %xmm0, xmm_regs+0(%rip)
+   movdqu  %xmm1, xmm_regs+16(%rip)
fstpt   x87_regs(%rip)
fstpt   x87_regs+16(%rip)
fldtx87_regs+16(%rip)
-- 
2.37.2



[PATCH] x86: Cast stride to __PTRDIFF_TYPE__ in AMX intrinsics

2022-08-22 Thread H.J. Lu via Gcc-patches
On 64-bit Windows, long is 32 bits and can't be used as stride in memory
operand when base is a pointer which is 64 bits.  Cast stride to
__PTRDIFF_TYPE__, instead of long.

PR target/106714
* config/i386/amxtileintrin.h (_tile_loadd_internal): Cast to
__PTRDIFF_TYPE__.
(_tile_stream_loadd_internal): Likewise.
(_tile_stored_internal): Likewise.
---
 gcc/config/i386/amxtileintrin.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
index 7b5a39eba72..06f18aa9bfb 100644
--- a/gcc/config/i386/amxtileintrin.h
+++ b/gcc/config/i386/amxtileintrin.h
@@ -62,7 +62,7 @@ _tile_release (void)
 #define _tile_loadd_internal(dst,base,stride)  \
   __asm__ volatile \
   ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" \
-   :: "r" ((const void*) (base)), "r" ((long) (stride)))
+   :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
 
 #define _tile_stream_loadd(dst,base,stride)\
   _tile_stream_loadd_internal (dst, base, stride)
@@ -70,7 +70,7 @@ _tile_release (void)
 #define _tile_stream_loadd_internal(dst,base,stride)   \
   __asm__ volatile \
   ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" 
\
-   :: "r" ((const void*) (base)), "r" ((long) (stride)))
+   :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
 
 #define _tile_stored(dst,base,stride)  \
   _tile_stored_internal (dst, base, stride)
@@ -78,7 +78,7 @@ _tile_release (void)
 #define _tile_stored_internal(src,base,stride) \
   __asm__ volatile \
   ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" \
-   :: "r" ((void*) (base)), "r" ((long) (stride)) \
+   :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)) \
: "memory")
 
 #define _tile_zero(dst)\
-- 
2.37.2



Re: [PATCH] Add ABI test for __bf16 type

2022-08-19 Thread H.J. Lu via Gcc-patches
On Thu, Aug 18, 2022 at 5:56 PM Hongtao Liu via Gcc-patches
 wrote:
>
> On Thu, Aug 18, 2022 at 3:36 PM Haochen Jiang via Gcc-patches
>  wrote:
> >
> > Hi all,
> >
> > This patch aims to add bf16 abi test after the whole __bf16 type is added.
> >
> > Regtested on x86_64-pc-linux-gnu. Ok for trunk?
> Ok.

All BF16 ABI tests failed due to missing __m128bf16/__m256bf16/__m512bf16.
When will __bf16 types be added?

> >
> > BRs,
> > Haochen
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/x86_64/abi/bf16/abi-bf16.exp: New test.
> > * gcc.target/x86_64/abi/bf16/args.h: Ditto.
> > * gcc.target/x86_64/abi/bf16/asm-support.S: Ditto.
> > * gcc.target/x86_64/abi/bf16/bf16-check.h: Ditto.
> > * gcc.target/x86_64/abi/bf16/bf16-helper.h: Ditto.
> > * gcc.target/x86_64/abi/bf16/defines.h: Ditto.
> > * gcc.target/x86_64/abi/bf16/m256bf16/abi-bf16-ymm.exp: Ditto.
> > * gcc.target/x86_64/abi/bf16/m256bf16/args.h: Ditto.
> > * gcc.target/x86_64/abi/bf16/m256bf16/asm-support.S: Ditto.
> > * gcc.target/x86_64/abi/bf16/m256bf16/bf16-ymm-check.h: Ditto.
> > * gcc.target/x86_64/abi/bf16/m256bf16/test_m256_returning.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/m256bf16/test_passing_m256.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/m256bf16/test_passing_structs.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/m256bf16/test_passing_unions.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/m256bf16/test_varargs-m256.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/m512bf16/abi-bf16-zmm.exp: Ditto.
> > * gcc.target/x86_64/abi/bf16/m512bf16/args.h: Ditto.
> > * gcc.target/x86_64/abi/bf16/m512bf16/asm-support.S: Ditto.
> > * gcc.target/x86_64/abi/bf16/m512bf16/bf16-zmm-check.h: Ditto.
> > * gcc.target/x86_64/abi/bf16/m512bf16/test_m512_returning.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/m512bf16/test_passing_m512.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/m512bf16/test_passing_structs.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/m512bf16/test_passing_unions.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/m512bf16/test_varargs-m512.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/macros.h: Ditto.
> > * gcc.target/x86_64/abi/bf16/test_3_element_struct_and_unions.c: 
> > Ditto.
> > * gcc.target/x86_64/abi/bf16/test_basic_alignment.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/test_basic_array_size_and_align.c: 
> > Ditto.
> > * gcc.target/x86_64/abi/bf16/test_basic_returning.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/test_basic_sizes.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/test_basic_struct_size_and_align.c: 
> > Ditto.
> > * gcc.target/x86_64/abi/bf16/test_basic_union_size_and_align.c: 
> > Ditto.
> > * gcc.target/x86_64/abi/bf16/test_m128_returning.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/test_passing_floats.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/test_passing_m128.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/test_passing_structs.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/test_passing_unions.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/test_struct_returning.c: Ditto.
> > * gcc.target/x86_64/abi/bf16/test_varargs-m128.c: Ditto.



-- 
H.J.


Re: [PATCH] stack-protector: Check stack canary for noreturn function

2022-08-17 Thread H.J. Lu via Gcc-patches
On Wed, Aug 3, 2022 at 10:27 AM H.J. Lu  wrote:
>
> On Tue, Aug 2, 2022 at 4:34 PM Jeff Law  wrote:
> >
> >
> >
> > On 8/2/2022 11:43 AM, H.J. Lu wrote:
> > > On Sat, Jul 30, 2022 at 1:30 PM Jeff Law via Gcc-patches
> > >  wrote:
> > >>
> > >>
> > >> On 7/14/2022 3:55 PM, H.J. Lu via Gcc-patches wrote:
> > >>> Check stack canary for noreturn function to catch stack corruption
> > >>> before calling noreturn function.  For C++, check stack canary when
> > >>> throwing exception or resuming stack unwind to avoid corrupted stack.
> > >>>
> > >>> gcc/
> > >>>
> > >>>PR middle-end/58245
> > >>>* calls.cc (expand_call): Check stack canary for noreturn
> > >>>function.
> > >>>
> > >>> gcc/testsuite/
> > >>>
> > >>>PR middle-end/58245
> > >>>* c-c++-common/pr58245-1.c: New test.
> > >>>* g++.dg/pr58245-1.C: Likewise.
> > >>>* g++.dg/fstack-protector-strong.C: Adjusted.
> > >> But is this really something we want?   I'd actually lean towards
> > >> eliminating the useless load -- I don't necessarily think we should be
> > >> treating non-returning paths specially here.
> > >>
> > >> The whole point of the stack protector is to prevent the *return* path
> > >> from going to an attacker controlled location.  I'm not sure checking
> > >> the protector at this point actually does anything particularly useful.
> > > > throw is marked noreturn.  Since the unwind library may read
> > > > the stack contents to unwind the stack, if the stack is corrupted,
> > > > exception handling may go wrong.  Should we handle this case?
> > That's the question I think we need to answer.  The EH paths are a known
> > security issue on Windows and while ours are notably different I'm not
> > sure if there's a real attack surface in those paths.  My sense is that
> > if we need to tackle this that doing so on the throw side might be
> > better as it's closer conceptually to when//how we check the canary for
> > a normal return.
>
> Like this?
>
> @@ -3154,7 +3155,10 @@ expand_call (tree exp, rtx target, int ignore)
>if (pass && (flags & ECF_MALLOC))
>start_sequence ();
>
> -  if (pass == 0
> +  /* Check the canary value for sibcall or function which doesn't
> +   return and could throw.  */
> +  if ((pass == 0
> + || ((flags & ECF_NORETURN) != 0 && tree_could_throw_p (exp)))
>  && crtl->stack_protect_guard
>  && targetm.stack_protect_runtime_enabled_p ())
>stack_protect_epilogue ();

Here is the patch:

https://gcc.gnu.org/pipermail/gcc-patches/2022-August/599916.html

> > jeff
> > >
> > >   --
> > > H.J.
> >
>
>
> --
> H.J.



-- 
H.J.


[PATCH v2] stack-protector: Check stack canary before throwing exception

2022-08-17 Thread H.J. Lu via Gcc-patches
Check stack canary before throwing exception to avoid stack corruption.

gcc/

PR middle-end/58245
* calls.cc: Include "tree-eh.h".
(expand_call): Check stack canary before throwing exception.

gcc/testsuite/

PR middle-end/58245
* g++.dg/fstack-protector-strong.C: Adjusted.
* g++.dg/pr58245-1.C: New test.
---
 gcc/calls.cc   |  6 +-
 gcc/testsuite/g++.dg/fstack-protector-strong.C |  2 +-
 gcc/testsuite/g++.dg/pr58245-1.C   | 10 ++
 3 files changed, 16 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/pr58245-1.C

diff --git a/gcc/calls.cc b/gcc/calls.cc
index bc96aff38f0..6dd6f73e978 100644
--- a/gcc/calls.cc
+++ b/gcc/calls.cc
@@ -60,6 +60,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "attr-fnspec.h"
 #include "value-query.h"
 #include "tree-pretty-print.h"
+#include "tree-eh.h"
 
 /* Like PREFERRED_STACK_BOUNDARY but in units of bytes, not bits.  */
 #define STACK_BYTES (PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT)
@@ -3154,7 +3155,10 @@ expand_call (tree exp, rtx target, int ignore)
   if (pass && (flags & ECF_MALLOC))
start_sequence ();
 
-  if (pass == 0
+  /* Check the canary value for sibcall or function which doesn't
+return and could throw.  */
+  if ((pass == 0
+  || ((flags & ECF_NORETURN) != 0 && tree_could_throw_p (exp)))
  && crtl->stack_protect_guard
  && targetm.stack_protect_runtime_enabled_p ())
stack_protect_epilogue ();
diff --git a/gcc/testsuite/g++.dg/fstack-protector-strong.C 
b/gcc/testsuite/g++.dg/fstack-protector-strong.C
index ae6d2fdb8df..034af2ce9ab 100644
--- a/gcc/testsuite/g++.dg/fstack-protector-strong.C
+++ b/gcc/testsuite/g++.dg/fstack-protector-strong.C
@@ -85,4 +85,4 @@ int foo7 (B *p)
   return p->return_slot ().a1;
 }
 
-/* { dg-final { scan-assembler-times "stack_chk_fail" 7 } } */
+/* { dg-final { scan-assembler-times "stack_chk_fail" 8 } } */
diff --git a/gcc/testsuite/g++.dg/pr58245-1.C b/gcc/testsuite/g++.dg/pr58245-1.C
new file mode 100644
index 000..1439bc62e71
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr58245-1.C
@@ -0,0 +1,10 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* rs6000-*-* s390x-*-* } } */
+/* { dg-options "-O2 -fstack-protector-all" } */
+
+void
+bar (void)
+{
+  throw 1;
+}
+
+/* { dg-final { scan-assembler-times "stack_chk_fail" 1 } } */
-- 
2.37.2



Re: [PATCH 2/2]middle-end: Support recognition of three-way max/min.

2022-08-03 Thread H.J. Lu via Gcc-patches
On Wed, Aug 3, 2022 at 1:26 AM Richard Biener via Gcc-patches
 wrote:
>
> On Wed, 3 Aug 2022, Tamar Christina wrote:
>
> >
> > > -Original Message-
> > > From: Richard Biener 
> > > Sent: Tuesday, August 2, 2022 10:11 AM
> > > To: Tamar Christina 
> > > Cc: Richard Biener ; ja...@redhat.com; nd
> > > ; gcc-patches@gcc.gnu.org
> > > Subject: Re: [PATCH 2/2]middle-end: Support recognition of three-way
> > > max/min.
> > >
> > > On Tue, Aug 2, 2022 at 10:33 AM Tamar Christina via Gcc-patches  > > patc...@gcc.gnu.org> wrote:
> > > >
> > > > > > > > When this function replaces the edge it doesn't seem to update
> > > > > > > > the
> > > > > > > dominators.
> > > > > > > > Since It's replacing the middle BB we then end up with an
> > > > > > > > error
> > > > > > > >
> > > > > > > > gcc/testsuite/gcc.dg/tree-ssa/minmax-14.c:17:1: error:
> > > > > > > > dominator of 5 should be 4, not 2
> > > > > > > >
> > > > > > > > during early verify. So instead, I replace the BB but defer
> > > > > > > > its deletion until cleanup which removes it and updates the
> > > dominators.
> > > > > > >
> > > > > > > Hmm, for a diamond shouldn't you replace
> > > > > > >
> > > > > > >   if (EDGE_SUCC (cond_block, 0)->dest == bb)
> > > > > > > edge_to_remove = EDGE_SUCC (cond_block, 1);
> > > > > > >   else
> > > > > > > edge_to_remove = EDGE_SUCC (cond_block, 0);
> > > > > > >
> > > > > > > with
> > > > > > >
> > > > > > >   if (EDGE_SUCC (cond_block, 0)->dest == bb)
> > > > > > > edge_to_remove = EDGE_SUCC (cond_block, 1);
> > > > > > >   else if (EDGE_SUCC (cond_block, 1)->dest == bb)
> > > > > > > edge_to_remove = EDGE_SUCC (cond_block, 0);
> > > > > > >
> > > > > > > thus, the code expects to be left with a fallthru to the PHI
> > > > > > > block which is expected to have the immediate dominator being
> > > > > > > cond_block but with a diamond there's a (possibly empty) block
> > > > > > > inbetween and dominators are wrong.
> > > > > >
> > > > > > Agreed, but the (EDGE_SUCC (cond_block, 1)->dest == bb) doesn't
> > > > > > seem like the Right one since for a diamond there will be a block
> > > > > > in between the two.  Did you perhaps mean  EDGE_SUCC (EDGE_SUCC
> > > > > > (cond_block, 1)->dest, 0)->dest == bb? i.e. that that destination
> > > > > > across the
> > > > > diamond be bb, and then you remove the middle block?
> > > > >
> > > > > Hmm, I think my condition was correct - the code tries to remove the
> > > > > edge to the middle-block and checks the remaining edge falls through
> > > > > to the merge block.  With a true diamond there is no fallthru to the
> > > > > merge block to keep so we better don't remove any edge?
> > > > >
> > > > > > For the minmax diamond we want both edges removed, since all the
> > > > > > code in the middle BBs are now dead.  But this is probably not
> > > > > > true in the general
> > > > > sense.
> > > >
> > > > Ah! Sorry I was firing a few cylinders short, I get what you mean now:
> > > >
> > > > @@ -425,8 +439,19 @@ replace_phi_edge_with_variable (basic_block
> > > cond_block,
> > > >edge edge_to_remove;
> > > >if (EDGE_SUCC (cond_block, 0)->dest == bb)
> > > >  edge_to_remove = EDGE_SUCC (cond_block, 1);
> > > > -  else
> > > > +  else if (EDGE_SUCC (cond_block, 1)->dest == bb)
> > > >  edge_to_remove = EDGE_SUCC (cond_block, 0);
> > > > +  else
> > > > +{
> > > > +  /* If neither edge from the conditional is the final bb
> > > > +then we must have a diamond block, in which case
> > > > +the true edge was changed by SET_USE above and we must
> > > > +mark the other edge as the false edge.  */
> > > > > +  gcond *cond = as_a <gcond *> (last_stmt (cond_block));
> > > > +  gimple_cond_make_false (cond);
> > > > +  return;
> > > > +}
> > > > +
> > >
> > > Note there is already
> > >
> > >   if (EDGE_COUNT (edge_to_remove->dest->preds) == 1)
> > > {
> > > ...
> > > }
> > >   else
> > > {
> > >   /* If there are other edges into the middle block make
> > >  CFG cleanup deal with the edge removal to avoid
> > >  updating dominators here in a non-trivial way.  */
> > > >   gcond *cond = as_a <gcond *> (last_stmt (cond_block));
> > >   if (edge_to_remove->flags & EDGE_TRUE_VALUE)
> > > gimple_cond_make_false (cond);
> > >   else
> > > gimple_cond_make_true (cond);
> > > }
> > >
> > > I'm not sure how you can say 'e' is always the true edge?  May I suggest 
> > > to
> > > amend the first condition with edge_to_remove && (and initialize that to
> > > NULL) and use e->flags instead of edge_to_remove in the else, of course
> > > also inverting the logic since we're keeping 'e'?
> >
> > As discussed on IRC, here's the version using keep_edge:
> >
> > @@ -422,12 +436,17 @@ replace_phi_edge_with_variable (basic_block 
> > cond_block,
> >SET_USE (PHI_ARG_DEF_PTR (phi, e->dest_idx), new_tree);
> >
> >/* Remove the empty basic block.  */
> > -  edge edge_to_remove;
> 

Re: [PATCH] stack-protector: Check stack canary for noreturn function

2022-08-03 Thread H.J. Lu via Gcc-patches
On Tue, Aug 2, 2022 at 4:34 PM Jeff Law  wrote:
>
>
>
> On 8/2/2022 11:43 AM, H.J. Lu wrote:
> > On Sat, Jul 30, 2022 at 1:30 PM Jeff Law via Gcc-patches
> >  wrote:
> >>
> >>
> >> On 7/14/2022 3:55 PM, H.J. Lu via Gcc-patches wrote:
> >>> Check stack canary for noreturn function to catch stack corruption
> >>> before calling noreturn function.  For C++, check stack canary when
> >>> throwing exception or resuming stack unwind to avoid corrupted stack.
> >>>
> >>> gcc/
> >>>
> >>>PR middle-end/58245
> >>>* calls.cc (expand_call): Check stack canary for noreturn
> >>>function.
> >>>
> >>> gcc/testsuite/
> >>>
> >>>PR middle-end/58245
> >>>* c-c++-common/pr58245-1.c: New test.
> >>>* g++.dg/pr58245-1.C: Likewise.
> >>>* g++.dg/fstack-protector-strong.C: Adjusted.
> >> But is this really something we want?   I'd actually lean towards
> >> eliminating the useless load -- I don't necessarily think we should be
> >> treating non-returning paths specially here.
> >>
> >> The whole point of the stack protector is to prevent the *return* path
> >> from going to an attacker controlled location.  I'm not sure checking
> >> the protector at this point actually does anything particularly useful.
> > throw is marked noreturn.  Since the unwind library may read
> > the stack contents to unwind the stack, if the stack is corrupted,
> > exception handling may go wrong.  Should we handle this case?
> That's the question I think we need to answer.  The EH paths are a known
> security issue on Windows and while ours are notably different I'm not
> sure if there's a real attack surface in those paths.  My sense is that
> if we need to tackle this that doing so on the throw side might be
> better as it's closer conceptually to when//how we check the canary for
> a normal return.

Like this?

@@ -3154,7 +3155,10 @@ expand_call (tree exp, rtx target, int ignore)
   if (pass && (flags & ECF_MALLOC))
   start_sequence ();

-  if (pass == 0
+  /* Check the canary value for sibcall or function which doesn't
+   return and could throw.  */
+  if ((pass == 0
+ || ((flags & ECF_NORETURN) != 0 && tree_could_throw_p (exp)))
 && crtl->stack_protect_guard
 && targetm.stack_protect_runtime_enabled_p ())
   stack_protect_epilogue ();

> jeff
> >
> >   --
> > H.J.
>


-- 
H.J.


Re: [PATCH] stack-protector: Check stack canary for noreturn function

2022-08-02 Thread H.J. Lu via Gcc-patches
On Sat, Jul 30, 2022 at 1:30 PM Jeff Law via Gcc-patches
 wrote:
>
>
>
> On 7/14/2022 3:55 PM, H.J. Lu via Gcc-patches wrote:
> > Check stack canary for noreturn function to catch stack corruption
> > before calling noreturn function.  For C++, check stack canary when
> > throwing exception or resuming stack unwind to avoid corrupted stack.
> >
> > gcc/
> >
> >   PR middle-end/58245
> >   * calls.cc (expand_call): Check stack canary for noreturn
> >   function.
> >
> > gcc/testsuite/
> >
> >   PR middle-end/58245
> >   * c-c++-common/pr58245-1.c: New test.
> >   * g++.dg/pr58245-1.C: Likewise.
> >   * g++.dg/fstack-protector-strong.C: Adjusted.
> But is this really something we want?   I'd actually lean towards
> eliminating the useless load -- I don't necessarily think we should be
> treating non-returning paths specially here.
>
> The whole point of the stack protector is to prevent the *return* path
> from going to an attacker controlled location.  I'm not sure checking
> the protector at this point actually does anything particularly useful.

throw is marked noreturn.  Since the unwind library may read
the stack contents to unwind the stack, if the stack is corrupted,
exception handling may go wrong.  Should we handle this case?

 --
H.J.


Re: [PATCH] [PR83782] i386 PIE: avoid @GOTOFF for ifuncs and their aliases

2022-08-01 Thread H.J. Lu via Gcc-patches
On Thu, Jul 28, 2022 at 9:31 AM H.J. Lu  wrote:
>
> On Thu, Jul 28, 2022 at 1:26 AM Alexandre Oliva  wrote:
> >
> > On Jul 27, 2022, "H.J. Lu"  wrote:
> >
> > > On Tue, Jul 26, 2022 at 10:14 PM Alexandre Oliva  
> > > wrote:
> >
> > >> The use of @GOTOFF for locally-bound but externally-visible symbols
> > >> (e.g. protected visibility) also breaks pointer identity if the
> > >> canonical address ends up preempted by a PLT entry.
> >
> > > Here is a different fix:
> >
> > > https://gcc.gnu.org/pipermail/gcc-patches/2022-July/598667.html
> >
> > Oh, thanks, I'd missed that.
> >
> > It doesn't seem to fix the part of the problem I quoted above, though.
> > I think fixing that requires testing the visibility, to make sure the
> > symbol's canonical address cannot be preempted, which may occur with
> > local binding, if the symbol is protected and referenced in the main
> > program, otherwise pointer identity is broken again, admittedly for a
> > more obscure case, but pointer identity was the point of the PR.
>
> The protected symbol issue isn't IFUNC specific.   The protected
> symbol handling is changed in glibc 2.36 and binutils 2.39.   Only
> the address of the protected symbol definition should be used as
> its address.
>
> > >> * config/i386/i386.cc (ix86_call_use_plt_p): Follow the alias
> > >> chain looking for an ifunc, as in gcc.target/i386/mvc10.c.
> >
> > You may also need to do something like this bit for mvc10.c on ia32 PIE.
> > Because the ifunc is called through an alias, AFAICT we don't even
> > notice that the call target is (an alias to) an ifunc.  GCC's
> > gotoff_operand predicate accepts it, but binutils (the linker, IIRC)
> > then rejects that reference, because the named symbol is an alias to an
> > ifunc.
>
> Yes, this change is needed.

I think this fix should be applied to default_binds_local_p_3:

  if (lookup_attribute ("weakref", DECL_ATTRIBUTES (exp))
  || (!targetm.ifunc_ref_local_ok ()
  && TREE_CODE (exp) == FUNCTION_DECL
  && cgraph_node::get (exp)
  && cgraph_node::get (exp)->ifunc_resolver))
return false;

since the ifunc_resolver check won't work on aliases.

-- 
H.J.


Re: [x86_64 PATCH] PR target/106450: Tweak timode_remove_non_convertible_regs.

2022-07-28 Thread H.J. Lu via Gcc-patches
On Thu, Jul 28, 2022 at 9:43 AM Roger Sayle  wrote:
>
>
> This patch resolves PR target/106450, some more fall-out from more
> aggressive TImode scalar-to-vector (STV) optimizations.  I continue
> to be caught out by how far TImode STV has diverged from DImode/SImode
> STV, and therefore requires additional (unexpected) tweaking.  Many
> thanks to H.J. Lu for pointing out timode_remove_non_convertible_regs
> needs to be extended to handle XOR (and other new operations).
>
> Unhelpfully the comment above this function states that it's the TImode
> version of "remove_non_convertible_regs", which doesn't exist anymore,
> so I've resurrected an explanatory comment from the git history.
> By refactoring the checks for hard regs and already "marked" regs
> into timode_check_non_convertible_regs itself, all its callers are
> simplified.  This patch then uses GET_RTX_CLASS to generically handle
> unary and binary operations, calling timode_check_non_convertible_regs
> on each TImode register operand in the single_set's SET_SRC.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32},
> with no new failures.  Ok for mainline?
>
>
> 2022-07-28  Roger Sayle  
>
> gcc/ChangeLog
> PR target/106450
> * config/i386/i386-features.cc (timode_check_non_convertible_regs):
> Do nothing if REGNO is set in the REGS bitmap, or is a hard reg.
> (timode_remove_non_convertible_regs): Update comment.
> Call timode_check_non_convertible_regs on all register operands
> of supported (binary and unary) operations.

Should we use

df_ref ref;
FOR_EACH_INSN_USE (ref, insn)
   if (!DF_REF_REG_MEM_P (ref))
 timode_check_non_convertible_regs (candidates, regs,
  DF_REF_REGNO (ref));

to check each use?

> gcc/testsuite/ChangeLog
> PR target/106450
> * gcc.target/i386/pr106450.c: New test case.
>
>
> Thanks in advance,
> Roger
> --
>


-- 
H.J.


Re: [PATCH] [PR83782] i386 PIE: avoid @GOTOFF for ifuncs and their aliases

2022-07-28 Thread H.J. Lu via Gcc-patches
On Thu, Jul 28, 2022 at 1:26 AM Alexandre Oliva  wrote:
>
> On Jul 27, 2022, "H.J. Lu"  wrote:
>
> > On Tue, Jul 26, 2022 at 10:14 PM Alexandre Oliva  wrote:
>
> >> The use of @GOTOFF for locally-bound but externally-visible symbols
> >> (e.g. protected visibility) also breaks pointer identity if the
> >> canonical address ends up preempted by a PLT entry.
>
> > Here is a different fix:
>
> > https://gcc.gnu.org/pipermail/gcc-patches/2022-July/598667.html
>
> Oh, thanks, I'd missed that.
>
> It doesn't seem to fix the part of the problem I quoted above, though.
> I think fixing that requires testing the visibility, to make sure the
> symbol's canonical address cannot be preempted, which may occur with
> local binding, if the symbol is protected and referenced in the main
> program, otherwise pointer identity is broken again, admittedly for a
> more obscure case, but pointer identity was the point of the PR.

The protected symbol issue isn't IFUNC specific.   The protected
symbol handling is changed in glibc 2.36 and binutils 2.39.   Only
the address of the protected symbol definition should be used as
its address.

> >> * config/i386/i386.cc (ix86_call_use_plt_p): Follow the alias
> >> chain looking for an ifunc, as in gcc.target/i386/mvc10.c.
>
> You may also need to do something like this bit for mvc10.c on ia32 PIE.
> Because the ifunc is called through an alias, AFAICT we don't even
> notice that the call target is (an alias to) an ifunc.  GCC's
> gotoff_operand predicate accepts it, but binutils (the linker, IIRC)
> then rejects that reference, because the named symbol is an alias to an
> ifunc.

Yes, this change is needed.

>
> --
> Alexandre Oliva, happy hacker                https://FSFLA.org/blogs/lxo/
>Free Software Activist   GNU Toolchain Engineer
> Disinformation flourishes because many people care deeply about injustice
> but very few check the facts.  Ask me about 



-- 
H.J.


PING [PATCH] x86: Add ix86_ifunc_ref_local_ok

2022-07-27 Thread H.J. Lu via Gcc-patches
On Thu, Jul 21, 2022 at 11:53 AM H.J. Lu  wrote:
>
> We can't always use the PLT entry as the function address for local IFUNC
> functions.  When the PIC register is needed for PLT call, indirect call
> via the PLT entry will fail since the PIC register may not be set up
> properly for indirect call.  Add ix86_ifunc_ref_local_ok to return false
> when the PLT entry can't be used as local IFUNC function pointers.
>
> gcc/
>
> PR target/83782
> * config/i386/i386.cc (ix86_ifunc_ref_local_ok): New.
> (TARGET_IFUNC_REF_LOCAL_OK): Use it.
>
> gcc/testsuite/
>
> PR target/83782
> * gcc.target/i386/pr83782-1.c: Require non-ia32.
> * gcc.target/i386/pr83782-2.c: Likewise.
> * gcc.target/i386/pr83782-3.c: New test.
> ---
>  gcc/config/i386/i386.cc   | 15 ++-
>  gcc/testsuite/gcc.target/i386/pr83782-1.c |  8 +++---
>  gcc/testsuite/gcc.target/i386/pr83782-2.c |  4 +--
>  gcc/testsuite/gcc.target/i386/pr83782-3.c | 32 +++
>  4 files changed, 50 insertions(+), 9 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr83782-3.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index e03f86d4a23..5e30dc884bf 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -16070,6 +16070,19 @@ ix86_call_use_plt_p (rtx call_op)
>return true;
>  }
>
> +/* Implement TARGET_IFUNC_REF_LOCAL_OK.  If this hook returns true,
> +   the PLT entry will be used as the function address for local IFUNC
> +   functions.  When the PIC register is needed for PLT call, indirect
> +   call via the PLT entry will fail since the PIC register may not be
> +   set up properly for indirect call.  In this case, we should return
> +   false.  */
> +
> +static bool
> +ix86_ifunc_ref_local_ok (void)
> +{
> +  return !flag_pic || (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC);
> +}
> +
>  /* Return true if the function being called was marked with attribute
> "noplt" or using -fno-plt and we are compiling for non-PIC.  We need
> to handle the non-PIC case in the backend because there is no easy
> @@ -24953,7 +24966,7 @@ ix86_libgcc_floating_mode_supported_p
>ix86_get_multilib_abi_name
>
>  #undef TARGET_IFUNC_REF_LOCAL_OK
> -#define TARGET_IFUNC_REF_LOCAL_OK hook_bool_void_true
> +#define TARGET_IFUNC_REF_LOCAL_OK ix86_ifunc_ref_local_ok
>
>  #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
>  # undef TARGET_ASM_RELOC_RW_MASK
> diff --git a/gcc/testsuite/gcc.target/i386/pr83782-1.c 
> b/gcc/testsuite/gcc.target/i386/pr83782-1.c
> index ce97b12e65d..85674346aec 100644
> --- a/gcc/testsuite/gcc.target/i386/pr83782-1.c
> +++ b/gcc/testsuite/gcc.target/i386/pr83782-1.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile } */
> +/* { dg-do compile { target { ! ia32 } } } */
>  /* { dg-require-ifunc "" } */
>  /* { dg-options "-O2 -fpic" } */
>
> @@ -20,7 +20,5 @@ bar(void)
>return foo;
>  }
>
> -/* { dg-final { scan-assembler {leal[ \t]foo@GOTOFF\(%[^,]*\),[ \t]%eax} { 
> target ia32 } } } */
> -/* { dg-final { scan-assembler {lea(?:l|q)[ \t]foo\(%rip\),[ \t]%(?:e|r)ax} 
> { target { ! ia32 } } } } */
> -/* { dg-final { scan-assembler-not "foo@GOT\\\(" { target ia32 } } } */
> -/* { dg-final { scan-assembler-not "foo@GOTPCREL\\\(" { target { ! ia32 } } 
> } } */
> +/* { dg-final { scan-assembler {lea(?:l|q)[ \t]foo\(%rip\),[ \t]%(?:e|r)ax} 
> } } */
> +/* { dg-final { scan-assembler-not "foo@GOTPCREL\\\(" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr83782-2.c 
> b/gcc/testsuite/gcc.target/i386/pr83782-2.c
> index e25d258bbda..a654ded771f 100644
> --- a/gcc/testsuite/gcc.target/i386/pr83782-2.c
> +++ b/gcc/testsuite/gcc.target/i386/pr83782-2.c
> @@ -1,4 +1,4 @@
> -/* { dg-do compile } */
> +/* { dg-do compile { target { ! ia32 } } } */
>  /* { dg-require-ifunc "" } */
>  /* { dg-options "-O2 -fpic" } */
>
> @@ -20,7 +20,5 @@ bar(void)
>return foo;
>  }
>
> -/* { dg-final { scan-assembler {leal[ \t]foo@GOTOFF\(%[^,]*\),[ \t]%eax} { 
> target ia32 } } } */
>  /* { dg-final { scan-assembler {lea(?:l|q)[ \t]foo\(%rip\),[ \t]%(?:e|r)ax} 
> { target { ! ia32 } } } } */
> -/* { dg-final { scan-assembler-not "foo@GOT\\\(" { target ia32 } } } */
>  /* { dg-final { scan-assembler-not "foo@GOTPCREL\\\(" { target { ! ia32 } } 
> } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr83782-3.c 
> b/gcc/testsuite/gcc.target/i386/pr83782-3.c
> new file mode 100644
> index 000..1536481cb79
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr83782-3.c
> @@ -0,0 +1,32 @@
> +/* { dg-do run }  */
> +/* { dg-require-ifunc "" } */
> +/* { dg-require-effective-target pie } */
> +/* { dg-options "-fpie -pie" } */
> +
> +#include <stdio.h>
> +
> +static int __attribute__((noinline))
> +implementation (void)
> +{
> +  printf ("'ere I am JH\n");
> +  return 0;
> +}
> +
> +static __typeof__ (implementation) *resolver (void)
> +{
> +  return (void *)implementation;
> +}
> +
> +extern int magic (void) __attribute__ 

Re: [PATCH] [PR83782] i386 PIE: avoid @GOTOFF for ifuncs and their aliases

2022-07-27 Thread H.J. Lu via Gcc-patches
On Tue, Jul 26, 2022 at 10:14 PM Alexandre Oliva  wrote:
>
>
> g++.dg/ext/attr-ifunc-3.C and gcc.target/i386/mvc10.c, not changed,
> have made it clear that there were problems in the optimizations to
> use @GOTOFF to refer to locally-bound ifuncs.  GNU ld as recently as
> May 2018 would reject such constructs, whereas later versions will
> silently accept but generate incorrect PIE with them (attr-ifunc-3.C)
> or still reject them if referenced through aliases (mvc10.c).  The use
> of @GOTOFF for locally-bound but externally-visible symbols
> (e.g. protected visibility) also breaks pointer identity if the
> canonical address ends up preempted by a PLT entry.  This patch
> modifies the local_symbolic_operand predicate to disable @GOTOFF for
> locally-bound symbols that would require @PLT for calls, restoring
> earlier behavior and disabling the optimization that has proven
> problematic even on amd64.  Eventually we may reintroduce the
> optimization, when the linker is fixed and we test for the fix before
> enabling it, and we exclude symbols whose canonical addresses may be
> preempted even when the symbol definition can't.  pr83782 tests have
> been adjusted to expect @GOT instead of @GOTOFF.
>
> Regstrapped on x86_64-linux-gnu; also tested, along with other patches
> I'm posting today with "i386 PIE" in the subject, and compared
> default-PIE and default-nonPIE results on it, and on i686-linux-gnu.  Ok
> to install?

Here is a different fix:

https://gcc.gnu.org/pipermail/gcc-patches/2022-July/598667.html

Using the PLT doesn't mean that the symbol can't be treated as local.  The
problem on ia32 is that the PIC register won't be set up properly for an
indirect call.  There is no problem on x86-64, or for non-PIC on ia32.

>
> for  gcc/ChangeLog
>
> PR target/83782
> * config/i386/predicates.md (local_symbolic_operand): Disable
> GOTOFF even for locally-bound ifuncs.
> * config/i386/i386.cc (ix86_call_use_plt_p): Follow the alias
> chain looking for an ifunc, as in gcc.target/i386/mvc10.c.
>
> for  gcc/testsuite/ChangeLog
>
> PR target/83782
> * gcc.target/i386/pr83782-1.c: Adjust to require GOT rather
> than GOTOFF on ia32.
> * gcc.target/i386/pr83782-2.c: Likewise.
> ---
>  gcc/config/i386/i386.cc   |   16 ++--
>  gcc/config/i386/predicates.md |4 +++-
>  gcc/testsuite/gcc.target/i386/pr83782-1.c |4 ++--
>  gcc/testsuite/gcc.target/i386/pr83782-2.c |4 ++--
>  4 files changed, 17 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index aab28da4b5d4b..5c5dc8d2373ff 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -16058,13 +16058,17 @@ ix86_call_use_plt_p (rtx call_op)
>  {
>if (SYMBOL_REF_DECL (call_op)
>   && TREE_CODE (SYMBOL_REF_DECL (call_op)) == FUNCTION_DECL)
> -   {
> - /* NB: All ifunc functions must be called via PLT.  */
> - cgraph_node *node
> -   = cgraph_node::get (SYMBOL_REF_DECL (call_op));
> - if (node && node->ifunc_resolver)
> +   /* NB: All ifunc functions must be called via PLT, and we have
> +  to explicitly iterate over an alias chain looking for a
> +  node marked as an ifunc(_resolver) to tell.  That node is
> +  itself aliased to the actual resolver function, so
> +  ultimate_alias_target would skip the marker, and the call
> +  may be to another declaration aliased to the ifunc.  */
> +   for (cgraph_node *node
> +  = cgraph_node::get (SYMBOL_REF_DECL (call_op));
> +node && node->alias; node = node->get_alias_target ())
> + if (node->ifunc_resolver)
> return true;
> -   }
>return false;
>  }
>return true;
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index 42053ea7209f6..411c06e22e600 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -596,7 +596,9 @@ (define_predicate "local_symbolic_operand"
>if (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op))
>  return false;
>if (SYMBOL_REF_LOCAL_P (op))
> -return true;
> +/* ifuncname@GOTOFF was rejected by the x86 linker before May
> +   2018, and silently generated wrong code for PIE afterwards.  */
> +return !ix86_call_use_plt_p (op);
>
>/* There is, however, a not insubstantial body of code in the rest of
>   the compiler that assumes it can just stick the results of
> diff --git a/gcc/testsuite/gcc.target/i386/pr83782-1.c 
> b/gcc/testsuite/gcc.target/i386/pr83782-1.c
> index ce97b12e65d58..af52278ec4df2 100644
> --- a/gcc/testsuite/gcc.target/i386/pr83782-1.c
> +++ b/gcc/testsuite/gcc.target/i386/pr83782-1.c
> @@ -20,7 +20,7 @@ bar(void)
>return foo;
>  }
>
> -/* { dg-final { scan-assembler {leal[ \t]foo@GOTOFF\(%[^,]*\),[ \t]%eax} { 
> target ia32 } } } */
> +/* { dg-final { 

[GCC 12] [PATCH] x86: Support 2/4/8 byte constant vector stores

2022-07-27 Thread H.J. Lu via Gcc-patches
On Fri, Jul 1, 2022 at 8:31 AM Uros Bizjak  wrote:
>
> On Thu, Jun 30, 2022 at 4:50 PM H.J. Lu  wrote:
> >
> > 1. Add a predicate for constant vectors which can be converted to integer
> > constants suitable for constant integer stores.  For a 8-byte constant
> > vector, the converted 64-bit integer must be valid for store with 64-bit
> > immediate, which is a 64-bit integer sign-extended from a 32-bit integer.
> > 2. Add a new pattern to allow 2-byte, 4-byte and 8-byte constant vector
> > stores, like
> >
> > (set (mem:V2HI (reg:DI 84))
> >  (const_vector:V2HI [(const_int 0 [0]) (const_int 1 [0x1])]))
> >
> > 3. After reload, convert constant vector stores to constant integer
> > stores, like
> >
> > (set (mem:SI (reg:DI 5 di [84]))
> >  (const_int 65536 [0x1]))
> >
> > For
> >
> > void
> > foo (short * c)
> > {
> >   c[0] = 0;
> >   c[1] = 1;
> > }
> >
> > it generates
> >
> > movl$65536, (%rdi)
> >
> > instead of
> >
> > movl.LC0(%rip), %eax
> > movl%eax, (%rdi)
> >
> > gcc/
> >
> > PR target/106022
> > * config/i386/i386-protos.h (ix86_convert_const_vector_to_integer):
> > New.
> > * config/i386/i386.cc (ix86_convert_const_vector_to_integer):
> > New.
> > * config/i386/mmx.md (V_16_32_64): New.
> > (*mov_imm): New patterns for stores with 16-bit, 32-bit
> > and 64-bit constant vector.
> > * config/i386/predicates.md (x86_64_const_vector_operand): New.
> >
> > gcc/testsuite/
> >
> > PR target/106022
> > * gcc.target/i386/pr106022-1.c: New test.
> > * gcc.target/i386/pr106022-2.c: Likewise.
> > * gcc.target/i386/pr106022-3.c: Likewise.
> > * gcc.target/i386/pr106022-4.c: Likewise.
>
> OK.

OK to backport to GCC 12 branch?

> Thanks,
> Uros.
>
> > ---
> >  gcc/config/i386/i386-protos.h  |  2 +
> >  gcc/config/i386/i386.cc| 47 ++
> >  gcc/config/i386/mmx.md | 37 +
> >  gcc/config/i386/predicates.md  | 11 +
> >  gcc/testsuite/gcc.target/i386/pr106022-1.c | 13 ++
> >  gcc/testsuite/gcc.target/i386/pr106022-2.c | 14 +++
> >  gcc/testsuite/gcc.target/i386/pr106022-3.c | 14 +++
> >  gcc/testsuite/gcc.target/i386/pr106022-4.c | 14 +++
> >  8 files changed, 152 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106022-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106022-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106022-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106022-4.c
> >
> > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> > index 3596ce81ecf..cf847751ac5 100644
> > --- a/gcc/config/i386/i386-protos.h
> > +++ b/gcc/config/i386/i386-protos.h
> > @@ -122,6 +122,8 @@ extern void ix86_expand_unary_operator (enum rtx_code, 
> > machine_mode,
> > rtx[]);
> >  extern rtx ix86_build_const_vector (machine_mode, bool, rtx);
> >  extern rtx ix86_build_signbit_mask (machine_mode, bool, bool);
> > +extern HOST_WIDE_INT ix86_convert_const_vector_to_integer (rtx,
> > +  machine_mode);
> >  extern void ix86_split_convert_uns_si_sse (rtx[]);
> >  extern void ix86_expand_convert_uns_didf_sse (rtx, rtx);
> >  extern void ix86_expand_convert_uns_sixf_sse (rtx, rtx);
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index b15b4893bb9..0cfe9962f75 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -15723,6 +15723,53 @@ ix86_build_signbit_mask (machine_mode mode, bool 
> > vect, bool invert)
> >return force_reg (vec_mode, v);
> >  }
> >
> > +/* Return HOST_WIDE_INT for const vector OP in MODE.  */
> > +
> > +HOST_WIDE_INT
> > +ix86_convert_const_vector_to_integer (rtx op, machine_mode mode)
> > +{
> > +  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
> > +gcc_unreachable ();
> > +
> > +  int nunits = GET_MODE_NUNITS (mode);
> > +  wide_int val = wi::zero (GET_MODE_BITSIZE (mode));
> > +  machine_mode innermode = GET_MODE_INNER (mode);
> > +  unsigned int innermode_bits = GET_MODE_BITSIZE (innermode);
> > +
> > +  switch (mode)
> > +{
> > +case E_V2QImode:
> > +case E_V4QImode:
> > +case E_V2HImode:
> > +case E_V8QImode:
> > +case E_V4HImode:
> > +case E_V2SImode:
> > +  for (int i = 0; i < nunits; ++i)
> > +   {
> > + int v = INTVAL (XVECEXP (op, 0, i));
> > + wide_int wv = wi::shwi (v, innermode_bits);
> > + val = wi::insert (val, wv, innermode_bits * i, innermode_bits);
> > +   }
> > +  break;
> > +case E_V2HFmode:
> > +case E_V4HFmode:
> > +case E_V2SFmode:
> > +  for (int i = 0; i < nunits; ++i)
> > +   {
> > + rtx x = XVECEXP (op, 0, i);
> > + int v = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (x),
> > 

Re: [PATCH] tree-optimization/106379 - add missing ~(a ^ b) folding for _Bool

2022-07-25 Thread H.J. Lu via Gcc-patches
On Fri, Jul 22, 2022 at 11:10 PM Richard Biener via Gcc-patches
 wrote:
>
>
>
> > Am 22.07.2022 um 22:17 schrieb H.J. Lu via Gcc-patches 
> > :
> >
> > On Thu, Jul 21, 2022 at 4:24 AM Richard Biener via Gcc-patches
> >  wrote:
> >>
> >> The following makes sure to fold ~(a ^ b) to a == b for truth
> >> values (but not vectors, we'd have to check for vector support of
> >> equality).  That turns the PR106379 testcase into a ranger one.
> >>
> >> Note that while we arrive at ~(a ^ b) in a convoluted way from
> >> original !a == !b one can eventually write the expression this
> >> way directly as well.
> >>
> >> Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
> >>
> >>PR tree-optimization/106379
> >>* match.pd (~(a ^ b) -> a == b): New pattern.
> >>
> >>* gcc.dg/pr106379-1.c: New testcase.
> >> ---
> >> gcc/match.pd  | 6 ++
> >> gcc/testsuite/gcc.dg/pr106379-1.c | 9 +
> >> 2 files changed, 15 insertions(+)
> >> create mode 100644 gcc/testsuite/gcc.dg/pr106379-1.c
> >>
> >> diff --git a/gcc/match.pd b/gcc/match.pd
> >> index 8bbc0dbd5cd..88a1a5aa9cc 100644
> >> --- a/gcc/match.pd
> >> +++ b/gcc/match.pd
> >> @@ -1938,6 +1938,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >>  (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
> >>   (bit_not (bit_xor (view_convert @0) @1
> >>
> >> +/* ~(a ^ b) is a == b for truth valued a and b.  */
> >> +(simplify
> >> + (bit_not (bit_xor:s truth_valued_p@0 truth_valued_p@1))
> >> + (if (!VECTOR_TYPE_P (type))
> >> +  (convert (eq @0 @1))))
> >
> > For integers, isn't it wrong to convert ~(boolean exp) to boolean exp?
>
> That’s what the (convert. …) should compensate for?

Is ~(boolean exp) == ~((int) (boolean exp)) or (int) (~(boolean exp))?

> Richard
>
> >
> >> /* (x & ~m) | (y & m) -> ((x ^ y) & m) ^ x */
> >> (simplify
> >>  (bit_ior:c (bit_and:cs @0 (bit_not @2)) (bit_and:cs @1 @2))
> >> diff --git a/gcc/testsuite/gcc.dg/pr106379-1.c 
> >> b/gcc/testsuite/gcc.dg/pr106379-1.c
> >> new file mode 100644
> >> index 000..7f2575e02dc
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.dg/pr106379-1.c
> >> @@ -0,0 +1,9 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-options "-O -fdump-tree-forwprop1" } */
> >> +
> >> +_Bool foo (_Bool a, _Bool b)
> >> +{
> >> +  return !a == !b;
> >> +}
> >> +
> >> +/* { dg-final { scan-tree-dump "\[ab\]_\[0-9\]+\\(D\\) == 
> >> \[ba\]_\[0-9\]+\\(D\\)" "forwprop1" } } */
> >> --
> >> 2.35.3
> >
> >
> >
> > --
> > H.J.



-- 
H.J.


Re: [PATCH] tree-optimization/106379 - add missing ~(a ^ b) folding for _Bool

2022-07-22 Thread H.J. Lu via Gcc-patches
On Thu, Jul 21, 2022 at 4:24 AM Richard Biener via Gcc-patches
 wrote:
>
> The following makes sure to fold ~(a ^ b) to a == b for truth
> values (but not vectors, we'd have to check for vector support of
> equality).  That turns the PR106379 testcase into a ranger one.
>
> Note that while we arrive at ~(a ^ b) in a convoluted way from
> original !a == !b one can eventually write the expression this
> way directly as well.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
>
> PR tree-optimization/106379
> * match.pd (~(a ^ b) -> a == b): New pattern.
>
> * gcc.dg/pr106379-1.c: New testcase.
> ---
>  gcc/match.pd  | 6 ++
>  gcc/testsuite/gcc.dg/pr106379-1.c | 9 +
>  2 files changed, 15 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/pr106379-1.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 8bbc0dbd5cd..88a1a5aa9cc 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -1938,6 +1938,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>(bit_not (bit_xor (view_convert @0) @1
>
> +/* ~(a ^ b) is a == b for truth valued a and b.  */
> +(simplify
> + (bit_not (bit_xor:s truth_valued_p@0 truth_valued_p@1))
> + (if (!VECTOR_TYPE_P (type))
> +  (convert (eq @0 @1))))

For integers, isn't it wrong to convert ~(boolean exp) to boolean exp?


>  /* (x & ~m) | (y & m) -> ((x ^ y) & m) ^ x */
>  (simplify
>   (bit_ior:c (bit_and:cs @0 (bit_not @2)) (bit_and:cs @1 @2))
> diff --git a/gcc/testsuite/gcc.dg/pr106379-1.c 
> b/gcc/testsuite/gcc.dg/pr106379-1.c
> new file mode 100644
> index 000..7f2575e02dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr106379-1.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-forwprop1" } */
> +
> +_Bool foo (_Bool a, _Bool b)
> +{
> +  return !a == !b;
> +}
> +
> +/* { dg-final { scan-tree-dump "\[ab\]_\[0-9\]+\\(D\\) == 
> \[ba\]_\[0-9\]+\\(D\\)" "forwprop1" } } */
> --
> 2.35.3



--
H.J.


[PATCH] x86: Add ix86_ifunc_ref_local_ok

2022-07-21 Thread H.J. Lu via Gcc-patches
We can't always use the PLT entry as the function address for local IFUNC
functions.  When the PIC register is needed for PLT call, indirect call
via the PLT entry will fail since the PIC register may not be set up
properly for indirect call.  Add ix86_ifunc_ref_local_ok to return false
when the PLT entry can't be used as local IFUNC function pointers.

gcc/

PR target/83782
* config/i386/i386.cc (ix86_ifunc_ref_local_ok): New.
(TARGET_IFUNC_REF_LOCAL_OK): Use it.

gcc/testsuite/

PR target/83782
* gcc.target/i386/pr83782-1.c: Require non-ia32.
* gcc.target/i386/pr83782-2.c: Likewise.
* gcc.target/i386/pr83782-3.c: New test.
---
 gcc/config/i386/i386.cc   | 15 ++-
 gcc/testsuite/gcc.target/i386/pr83782-1.c |  8 +++---
 gcc/testsuite/gcc.target/i386/pr83782-2.c |  4 +--
 gcc/testsuite/gcc.target/i386/pr83782-3.c | 32 +++
 4 files changed, 50 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr83782-3.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index e03f86d4a23..5e30dc884bf 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -16070,6 +16070,19 @@ ix86_call_use_plt_p (rtx call_op)
   return true;
 }
 
+/* Implement TARGET_IFUNC_REF_LOCAL_OK.  If this hook returns true,
+   the PLT entry will be used as the function address for local IFUNC
+   functions.  When the PIC register is needed for PLT call, indirect
+   call via the PLT entry will fail since the PIC register may not be
+   set up properly for indirect call.  In this case, we should return
+   false.  */
+
+static bool
+ix86_ifunc_ref_local_ok (void)
+{
+  return !flag_pic || (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC);
+}
+
 /* Return true if the function being called was marked with attribute
"noplt" or using -fno-plt and we are compiling for non-PIC.  We need
to handle the non-PIC case in the backend because there is no easy
@@ -24953,7 +24966,7 @@ ix86_libgcc_floating_mode_supported_p
   ix86_get_multilib_abi_name
 
 #undef TARGET_IFUNC_REF_LOCAL_OK
-#define TARGET_IFUNC_REF_LOCAL_OK hook_bool_void_true
+#define TARGET_IFUNC_REF_LOCAL_OK ix86_ifunc_ref_local_ok
 
 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
 # undef TARGET_ASM_RELOC_RW_MASK
diff --git a/gcc/testsuite/gcc.target/i386/pr83782-1.c 
b/gcc/testsuite/gcc.target/i386/pr83782-1.c
index ce97b12e65d..85674346aec 100644
--- a/gcc/testsuite/gcc.target/i386/pr83782-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr83782-1.c
@@ -1,4 +1,4 @@
-/* { dg-do compile } */
+/* { dg-do compile { target { ! ia32 } } } */
 /* { dg-require-ifunc "" } */
 /* { dg-options "-O2 -fpic" } */
 
@@ -20,7 +20,5 @@ bar(void)
   return foo;
 }
 
-/* { dg-final { scan-assembler {leal[ \t]foo@GOTOFF\(%[^,]*\),[ \t]%eax} { 
target ia32 } } } */
-/* { dg-final { scan-assembler {lea(?:l|q)[ \t]foo\(%rip\),[ \t]%(?:e|r)ax} { 
target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-not "foo@GOT\\\(" { target ia32 } } } */
-/* { dg-final { scan-assembler-not "foo@GOTPCREL\\\(" { target { ! ia32 } } } 
} */
+/* { dg-final { scan-assembler {lea(?:l|q)[ \t]foo\(%rip\),[ \t]%(?:e|r)ax} } 
} */
+/* { dg-final { scan-assembler-not "foo@GOTPCREL\\\(" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr83782-2.c 
b/gcc/testsuite/gcc.target/i386/pr83782-2.c
index e25d258bbda..a654ded771f 100644
--- a/gcc/testsuite/gcc.target/i386/pr83782-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr83782-2.c
@@ -1,4 +1,4 @@
-/* { dg-do compile } */
+/* { dg-do compile { target { ! ia32 } } } */
 /* { dg-require-ifunc "" } */
 /* { dg-options "-O2 -fpic" } */
 
@@ -20,7 +20,5 @@ bar(void)
   return foo;
 }
 
-/* { dg-final { scan-assembler {leal[ \t]foo@GOTOFF\(%[^,]*\),[ \t]%eax} { 
target ia32 } } } */
 /* { dg-final { scan-assembler {lea(?:l|q)[ \t]foo\(%rip\),[ \t]%(?:e|r)ax} { 
target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-not "foo@GOT\\\(" { target ia32 } } } */
 /* { dg-final { scan-assembler-not "foo@GOTPCREL\\\(" { target { ! ia32 } } } 
} */
diff --git a/gcc/testsuite/gcc.target/i386/pr83782-3.c 
b/gcc/testsuite/gcc.target/i386/pr83782-3.c
new file mode 100644
index 000..1536481cb79
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr83782-3.c
@@ -0,0 +1,32 @@
+/* { dg-do run }  */
+/* { dg-require-ifunc "" } */
+/* { dg-require-effective-target pie } */
+/* { dg-options "-fpie -pie" } */
+
+#include <stdio.h>
+
+static int __attribute__((noinline))
+implementation (void)
+{
+  printf ("'ere I am JH\n");
+  return 0;
+}
+
+static __typeof__ (implementation) *resolver (void)
+{
+  return (void *)implementation;
+}
+
+extern int magic (void) __attribute__ ((ifunc ("resolver")));
+
+__attribute__ ((weak))
+int
+call_magic (int (*ptr) (void))
+{
+  return ptr ();
+}
+
+int main ()
+{
+  return call_magic (magic);
+}
-- 
2.36.1



[PATCH] stack-protector: Check stack canary for noreturn function

2022-07-14 Thread H.J. Lu via Gcc-patches
Check stack canary for noreturn function to catch stack corruption
before calling noreturn function.  For C++, check stack canary when
throwing exception or resuming stack unwind to avoid corrupted stack.

gcc/

PR middle-end/58245
* calls.cc (expand_call): Check stack canary for noreturn
function.

gcc/testsuite/

PR middle-end/58245
* c-c++-common/pr58245-1.c: New test.
* g++.dg/pr58245-1.C: Likewise.
* g++.dg/fstack-protector-strong.C: Adjusted.
---
 gcc/calls.cc   |  7 ++-
 gcc/testsuite/c-c++-common/pr58245-1.c | 12 
 gcc/testsuite/g++.dg/fstack-protector-strong.C |  2 +-
 gcc/testsuite/g++.dg/pr58245-1.C   | 10 ++
 4 files changed, 29 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/pr58245-1.c
 create mode 100644 gcc/testsuite/g++.dg/pr58245-1.C

diff --git a/gcc/calls.cc b/gcc/calls.cc
index bc96aff38f0..7816c2c8d99 100644
--- a/gcc/calls.cc
+++ b/gcc/calls.cc
@@ -3154,7 +3154,12 @@ expand_call (tree exp, rtx target, int ignore)
   if (pass && (flags & ECF_MALLOC))
start_sequence ();
 
-  if (pass == 0
+  /* Check the canary value for sibcall or function which doesn't
+return.  */
+  if ((pass == 0
+  || ((flags & ECF_NORETURN) != 0
+  && (fndecl
+  != get_callee_fndecl (targetm.stack_protect_fail ()))))
  && crtl->stack_protect_guard
  && targetm.stack_protect_runtime_enabled_p ())
stack_protect_epilogue ();
diff --git a/gcc/testsuite/c-c++-common/pr58245-1.c 
b/gcc/testsuite/c-c++-common/pr58245-1.c
new file mode 100644
index 000..945acc53004
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/pr58245-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* rs6000-*-* s390x-*-* } } */
+/* { dg-options "-O2 -fstack-protector-all" } */
+
+extern void foo (void) __attribute__ ((noreturn));
+
+void
+bar (void)
+{
+  foo ();
+}
+
+/* { dg-final { scan-assembler-times "stack_chk_fail" 1 } } */
diff --git a/gcc/testsuite/g++.dg/fstack-protector-strong.C 
b/gcc/testsuite/g++.dg/fstack-protector-strong.C
index ae6d2fdb8df..034af2ce9ab 100644
--- a/gcc/testsuite/g++.dg/fstack-protector-strong.C
+++ b/gcc/testsuite/g++.dg/fstack-protector-strong.C
@@ -85,4 +85,4 @@ int foo7 (B *p)
   return p->return_slot ().a1;
 }
 
-/* { dg-final { scan-assembler-times "stack_chk_fail" 7 } } */
+/* { dg-final { scan-assembler-times "stack_chk_fail" 8 } } */
diff --git a/gcc/testsuite/g++.dg/pr58245-1.C b/gcc/testsuite/g++.dg/pr58245-1.C
new file mode 100644
index 000..1439bc62e71
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr58245-1.C
@@ -0,0 +1,10 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* rs6000-*-* s390x-*-* } } */
+/* { dg-options "-O2 -fstack-protector-all" } */
+
+void
+bar (void)
+{
+  throw 1;
+}
+
+/* { dg-final { scan-assembler-times "stack_chk_fail" 1 } } */
-- 
2.36.1



Re: [PATCH v3] Simplify memchr with small constant strings

2022-07-14 Thread H.J. Lu via Gcc-patches
On Wed, Jul 13, 2022 at 11:42 PM Richard Biener
 wrote:
>
> On Wed, Jul 13, 2022 at 6:50 PM H.J. Lu  wrote:
> >
> > When memchr is applied on a constant string of no more than the bytes of
> > a word, simplify memchr by checking each byte in the constant string.
> >
> > int f (int a)
> > {
> >return  __builtin_memchr ("AE", a, 2) != 0;
> > }
> >
> > is simplified to
> >
> > int f (int a)
> > {
> >   return ((char) a == 'A' || (char) a == 'E') != 0;
> > }
> >
> > gcc/
> >
> > PR tree-optimization/103798
> > * tree-ssa-forwprop.cc: Include "tree-ssa-strlen.h".
> > (simplify_builtin_call): Inline memchr with constant strings of
> > no more than the bytes of a word.
> > * tree-ssa-strlen.cc (use_in_zero_equality): Make it global.
> > * tree-ssa-strlen.h (use_in_zero_equality): New.
> >
> > gcc/testsuite/
> >
> > PR tree-optimization/103798
> > * c-c++-common/pr103798-1.c: New test.
> > * c-c++-common/pr103798-2.c: Likewise.
> > * c-c++-common/pr103798-3.c: Likewise.
> > * c-c++-common/pr103798-4.c: Likewise.
> > * c-c++-common/pr103798-5.c: Likewise.
> > * c-c++-common/pr103798-6.c: Likewise.
> > * c-c++-common/pr103798-7.c: Likewise.
> > * c-c++-common/pr103798-8.c: Likewise.
> > * c-c++-common/pr103798-9.c: Likewise.
> > * c-c++-common/pr103798-10.c: Likewise.
> > ---
> >  gcc/testsuite/c-c++-common/pr103798-1.c  | 28 +
> >  gcc/testsuite/c-c++-common/pr103798-10.c | 10 
> >  gcc/testsuite/c-c++-common/pr103798-2.c  | 30 ++
> >  gcc/testsuite/c-c++-common/pr103798-3.c  | 28 +
> >  gcc/testsuite/c-c++-common/pr103798-4.c  | 28 +
> >  gcc/testsuite/c-c++-common/pr103798-5.c  | 26 +
> >  gcc/testsuite/c-c++-common/pr103798-6.c  | 27 +
> >  gcc/testsuite/c-c++-common/pr103798-7.c  | 27 +
> >  gcc/testsuite/c-c++-common/pr103798-8.c  | 27 +
> >  gcc/testsuite/c-c++-common/pr103798-9.c  | 10 
> >  gcc/tree-ssa-forwprop.cc | 73 
> >  gcc/tree-ssa-strlen.cc   |  4 +-
> >  gcc/tree-ssa-strlen.h|  2 +
> >  13 files changed, 318 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-1.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-10.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-2.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-3.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-4.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-5.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-6.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-7.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-8.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-9.c
> >
> > diff --git a/gcc/testsuite/c-c++-common/pr103798-1.c 
> > b/gcc/testsuite/c-c++-common/pr103798-1.c
> > new file mode 100644
> > index 000..cd3edf569fc
> > --- /dev/null
> > +++ b/gcc/testsuite/c-c++-common/pr103798-1.c
> > @@ -0,0 +1,28 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
> > +
> > +__attribute__ ((weak))
> > +int
> > +f (char a)
> > +{
> > +   return  __builtin_memchr ("a", a, 1) == 0;
> > +}
> > +
> > +__attribute__ ((weak))
> > +int
> > +g (char a)
> > +{
> > +  return a != 'a';
> > +}
> > +
> > +int
> > +main ()
> > +{
> > + for (int i = 0; i < 255; i++)
> > +   if (f (i) != g (i))
> > + __builtin_abort ();
> > +
> > + return 0;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "memchr" } } */
> > diff --git a/gcc/testsuite/c-c++-common/pr103798-10.c 
> > b/gcc/testsuite/c-c++-common/pr103798-10.c
> > new file mode 100644
> > index 000..4677e9539fa
> > --- /dev/null
> > +++ b/gcc/testsuite/c-c++-common/pr103798-10.c
> > @@ -0,0 +1,10 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-Os -fdump-tree-optimized -save-temps" } */
> > +
> > +int
> > +f (char a)
> > +{
> > +  return  __builtin_memchr ("ac", a, 1) == 0;
> > +}
> > +
> > +/* { dg-final { scan-assembler "memchr" } } */
> > diff --git a/gcc/testsuite/c-c++-common/pr103798-2.c 
> > b/gcc/testsuite/c-c++-common/pr103798-2.c
> > new file mode 100644
> > index 000..e7e99c3679e
> > --- /dev/null
> > +++ b/gcc/testsuite/c-c++-common/pr103798-2.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
> > +
> > +#include <string.h>
> > +
> > +__attribute__ ((weak))
> > +int
> > +f (int a)
> > +{
> > +   return memchr ("aE", a, 2) != NULL;
> > +}
> > +
> > +__attribute__ ((weak))
> > +int
> > +g (char a)
> > +{
> > +  return a == 'a' || a == 'E';
> > +}
> > +
> > +int
> > +main ()
> > +{
> > + for (int i = 0; i < 255; i++)
> > +   if (f (i + 256) != g (i + 256))
> > + __builtin_abort ();
> > +
> > + return 0;
> > +}
> > +
> > +/* { dg-final { 

[PATCH] x86: Disable sibcall if indirect_return attribute doesn't match

2022-07-14 Thread H.J. Lu via Gcc-patches
When shadow stack is enabled, function with indirect_return attribute
may return via indirect jump.  In this case, we need to disable sibcall
if caller doesn't have indirect_return attribute and indirect branch
tracking is enabled since compiler won't generate ENDBR when calling the
caller.

gcc/

PR target/85620
* config/i386/i386.cc (ix86_function_ok_for_sibcall): Return
false if callee has indirect_return attribute and caller
doesn't.

gcc/testsuite/

PR target/85620
* gcc.target/i386/pr85620-2.c: Updated.
* gcc.target/i386/pr85620-5.c: New test.
* gcc.target/i386/pr85620-6.c: Likewise.
* gcc.target/i386/pr85620-7.c: Likewise.
---
 gcc/config/i386/i386.cc   | 10 ++
 gcc/testsuite/gcc.target/i386/pr85620-2.c |  3 ++-
 gcc/testsuite/gcc.target/i386/pr85620-5.c | 13 +
 gcc/testsuite/gcc.target/i386/pr85620-6.c | 14 ++
 gcc/testsuite/gcc.target/i386/pr85620-7.c | 14 ++
 5 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr85620-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr85620-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr85620-7.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3a3c7299eb4..e03f86d4a23 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -1024,6 +1024,16 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
 return false;
 }
 
+  /* Disable sibcall if callee has indirect_return attribute and
+ caller doesn't since callee will return to the caller's caller
+ via an indirect jump.  */
+  if (((flag_cf_protection & (CF_RETURN | CF_BRANCH))
+   == (CF_RETURN | CF_BRANCH))
+  && lookup_attribute ("indirect_return", TYPE_ATTRIBUTES (type))
+  && !lookup_attribute ("indirect_return",
+   TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))))
+return false;
+
   /* Otherwise okay.  That also includes certain types of indirect calls.  */
   return true;
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr85620-2.c 
b/gcc/testsuite/gcc.target/i386/pr85620-2.c
index b2e680fa1fe..14ce0ffd1e1 100644
--- a/gcc/testsuite/gcc.target/i386/pr85620-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr85620-2.c
@@ -1,6 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -fcf-protection" } */
-/* { dg-final { scan-assembler-times {\mendbr} 1 } } */
+/* { dg-final { scan-assembler-times {\mendbr} 2 } } */
+/* { dg-final { scan-assembler-not "jmp" } } */
 
 struct ucontext;
 
diff --git a/gcc/testsuite/gcc.target/i386/pr85620-5.c 
b/gcc/testsuite/gcc.target/i386/pr85620-5.c
new file mode 100644
index 000..04537702d09
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr85620-5.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fcf-protection" } */
+/* { dg-final { scan-assembler-not "jmp" } } */
+
+struct ucontext;
+
+extern int (*bar) (struct ucontext *) __attribute__((__indirect_return__));
+
+int
+foo (struct ucontext *oucp)
+{
+  return bar (oucp);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr85620-6.c 
b/gcc/testsuite/gcc.target/i386/pr85620-6.c
new file mode 100644
index 000..0b6a64e8454
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr85620-6.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fcf-protection" } */
+/* { dg-final { scan-assembler "jmp" } } */
+
+struct ucontext;
+
+extern int bar (struct ucontext *) __attribute__((__indirect_return__));
+
+__attribute__((__indirect_return__))
+int
+foo (struct ucontext *oucp)
+{
+  return bar (oucp);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr85620-7.c 
b/gcc/testsuite/gcc.target/i386/pr85620-7.c
new file mode 100644
index 000..fa62d56decf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr85620-7.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fcf-protection" } */
+/* { dg-final { scan-assembler "jmp" } } */
+
+struct ucontext;
+
+extern int (*bar) (struct ucontext *) __attribute__((__indirect_return__));
+extern int foo (struct ucontext *) __attribute__((__indirect_return__));
+
+int
+foo (struct ucontext *oucp)
+{
+  return bar (oucp);
+}
-- 
2.36.1



Re: [PATCH v2] Simplify memchr with small constant strings

2022-07-13 Thread H.J. Lu via Gcc-patches
On Wed, Jul 13, 2022 at 5:35 AM Richard Biener
 wrote:
>
> On Tue, Jul 12, 2022 at 6:59 PM H.J. Lu  wrote:
> >
> > On Fri, Jul 8, 2022 at 5:54 AM Richard Biener
> >  wrote:
> > >
> > > On Thu, Jul 7, 2022 at 6:45 PM H.J. Lu  wrote:
> > > >
> > > > When memchr is applied on a constant string of no more than the bytes of
> > > > a word, simplify memchr by checking each byte in the constant string.
> > > >
> > > > int f (int a)
> > > > {
> > > >return  __builtin_memchr ("AE", a, 2) != 0;
> > > > }
> > > >
> > > > is simplified to
> > > >
> > > > int f (int a)
> > > > {
> > > >   return ((char) a == 'A' || (char) a == 'E') != 0;
> > > > }
> > > >
> > > > gcc/
> > > >
> > > > PR tree-optimization/103798
> > > > * tree-ssa-forwprop.cc: Include "tree-ssa-strlen.h".
> > > > (simplify_builtin_call): Inline memchr with constant strings of
> > > > no more than the bytes of a word.
> > > > * tree-ssa-strlen.cc (use_in_zero_equality): Make it global.
> > > > * tree-ssa-strlen.h (use_in_zero_equality): New.
> > > >
> > > > gcc/testsuite/
> > > >
> > > > PR tree-optimization/103798
> > > > * c-c++-common/pr103798-1.c: New test.
> > > > * c-c++-common/pr103798-2.c: Likewise.
> > > > * c-c++-common/pr103798-3.c: Likewise.
> > > > * c-c++-common/pr103798-4.c: Likewise.
> > > > * c-c++-common/pr103798-5.c: Likewise.
> > > > * c-c++-common/pr103798-6.c: Likewise.
> > > > * c-c++-common/pr103798-7.c: Likewise.
> > > > * c-c++-common/pr103798-8.c: Likewise.
> > > > ---
> > > >  gcc/testsuite/c-c++-common/pr103798-1.c | 28 +++
> > > >  gcc/testsuite/c-c++-common/pr103798-2.c | 30 
> > > >  gcc/testsuite/c-c++-common/pr103798-3.c | 28 +++
> > > >  gcc/testsuite/c-c++-common/pr103798-4.c | 28 +++
> > > >  gcc/testsuite/c-c++-common/pr103798-5.c | 26 ++
> > > >  gcc/testsuite/c-c++-common/pr103798-6.c | 27 +++
> > > >  gcc/testsuite/c-c++-common/pr103798-7.c | 27 +++
> > > >  gcc/testsuite/c-c++-common/pr103798-8.c | 27 +++
> > > >  gcc/tree-ssa-forwprop.cc| 64 +
> > > >  gcc/tree-ssa-strlen.cc  |  4 +-
> > > >  gcc/tree-ssa-strlen.h   |  2 +
> > > >  11 files changed, 289 insertions(+), 2 deletions(-)
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-1.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-2.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-3.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-4.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-5.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-6.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-7.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-8.c
> > > >
> > > > diff --git a/gcc/testsuite/c-c++-common/pr103798-1.c 
> > > > b/gcc/testsuite/c-c++-common/pr103798-1.c
> > > > new file mode 100644
> > > > index 000..cd3edf569fc
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/c-c++-common/pr103798-1.c
> > > > @@ -0,0 +1,28 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
> > > > +
> > > > +__attribute__ ((weak))
> > > > +int
> > > > +f (char a)
> > > > +{
> > > > +   return  __builtin_memchr ("a", a, 1) == 0;
> > > > +}
> > > > +
> > > > +__attribute__ ((weak))
> > > > +int
> > > > +g (char a)
> > > > +{
> > > > +  return a != 'a';
> > > > +}
> > > > +
> > > > +int
> > > > +main ()
> > > > +{
> > > > + for (int i = 0; i < 255; i++)
> > > > +   if (f (i) != g (i))
> > > > + __builtin_abort ();
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-assembler-not "memchr" } } */
> > > > diff --git a/gcc/testsuite/c-c++-common/pr103798-2.c 
> > > > b/gcc/testsuite/c-c++-common/pr103798-2.c
> > > > new file mode 100644
> > > > index 000..e7e99c3679e
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/c-c++-common/pr103798-2.c
> > > > @@ -0,0 +1,30 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +__attribute__ ((weak))
> > > > +int
> > > > +f (int a)
> > > > +{
> > > > +   return memchr ("aE", a, 2) != NULL;
> > > > +}
> > > > +
> > > > +__attribute__ ((weak))
> > > > +int
> > > > +g (char a)
> > > > +{
> > > > +  return a == 'a' || a == 'E';
> > > > +}
> > > > +
> > > > +int
> > > > +main ()
> > > > +{
> > > > + for (int i = 0; i < 255; i++)
> > > > +   if (f (i + 256) != g (i + 256))
> > > > + __builtin_abort ();
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +/* { dg-final { scan-assembler-not "memchr" } } */
> > > > diff --git a/gcc/testsuite/c-c++-common/pr103798-3.c 
> > > > b/gcc/testsuite/c-c++-common/pr103798-3.c
> > > > new file mode 100644
> > > > index 

[PATCH v3] Simplify memchr with small constant strings

2022-07-13 Thread H.J. Lu via Gcc-patches
When memchr is applied on a constant string of no more than the bytes of
a word, simplify memchr by checking each byte in the constant string.

int f (int a)
{
   return  __builtin_memchr ("AE", a, 2) != 0;
}

is simplified to

int f (int a)
{
  return ((char) a == 'A' || (char) a == 'E') != 0;
}

gcc/

PR tree-optimization/103798
* tree-ssa-forwprop.cc: Include "tree-ssa-strlen.h".
(simplify_builtin_call): Inline memchr with constant strings of
no more than the bytes of a word.
* tree-ssa-strlen.cc (use_in_zero_equality): Make it global.
* tree-ssa-strlen.h (use_in_zero_equality): New.

gcc/testsuite/

PR tree-optimization/103798
* c-c++-common/pr103798-1.c: New test.
* c-c++-common/pr103798-2.c: Likewise.
* c-c++-common/pr103798-3.c: Likewise.
* c-c++-common/pr103798-4.c: Likewise.
* c-c++-common/pr103798-5.c: Likewise.
* c-c++-common/pr103798-6.c: Likewise.
* c-c++-common/pr103798-7.c: Likewise.
* c-c++-common/pr103798-8.c: Likewise.
* c-c++-common/pr103798-9.c: Likewise.
* c-c++-common/pr103798-10.c: Likewise.
---
 gcc/testsuite/c-c++-common/pr103798-1.c  | 28 +
 gcc/testsuite/c-c++-common/pr103798-10.c | 10 
 gcc/testsuite/c-c++-common/pr103798-2.c  | 30 ++
 gcc/testsuite/c-c++-common/pr103798-3.c  | 28 +
 gcc/testsuite/c-c++-common/pr103798-4.c  | 28 +
 gcc/testsuite/c-c++-common/pr103798-5.c  | 26 +
 gcc/testsuite/c-c++-common/pr103798-6.c  | 27 +
 gcc/testsuite/c-c++-common/pr103798-7.c  | 27 +
 gcc/testsuite/c-c++-common/pr103798-8.c  | 27 +
 gcc/testsuite/c-c++-common/pr103798-9.c  | 10 
 gcc/tree-ssa-forwprop.cc | 73 
 gcc/tree-ssa-strlen.cc   |  4 +-
 gcc/tree-ssa-strlen.h|  2 +
 13 files changed, 318 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-1.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-10.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-2.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-3.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-4.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-5.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-6.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-7.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-8.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-9.c

diff --git a/gcc/testsuite/c-c++-common/pr103798-1.c 
b/gcc/testsuite/c-c++-common/pr103798-1.c
new file mode 100644
index 000..cd3edf569fc
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/pr103798-1.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
+
+__attribute__ ((weak))
+int
+f (char a)
+{
+   return  __builtin_memchr ("a", a, 1) == 0;
+}
+
+__attribute__ ((weak))
+int
+g (char a)
+{
+  return a != 'a';
+}
+
+int
+main ()
+{
+ for (int i = 0; i < 255; i++)
+   if (f (i) != g (i))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "memchr" } } */
diff --git a/gcc/testsuite/c-c++-common/pr103798-10.c 
b/gcc/testsuite/c-c++-common/pr103798-10.c
new file mode 100644
index 000..4677e9539fa
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/pr103798-10.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-Os -fdump-tree-optimized -save-temps" } */
+
+int
+f (char a)
+{
+  return  __builtin_memchr ("ac", a, 1) == 0;
+}
+
+/* { dg-final { scan-assembler "memchr" } } */
diff --git a/gcc/testsuite/c-c++-common/pr103798-2.c 
b/gcc/testsuite/c-c++-common/pr103798-2.c
new file mode 100644
index 000..e7e99c3679e
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/pr103798-2.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
+
+#include <string.h>
+
+__attribute__ ((weak))
+int
+f (int a)
+{
+   return memchr ("aE", a, 2) != NULL;
+}
+
+__attribute__ ((weak))
+int
+g (char a)
+{
+  return a == 'a' || a == 'E';
+}
+
+int
+main ()
+{
+ for (int i = 0; i < 255; i++)
+   if (f (i + 256) != g (i + 256))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "memchr" } } */
diff --git a/gcc/testsuite/c-c++-common/pr103798-3.c 
b/gcc/testsuite/c-c++-common/pr103798-3.c
new file mode 100644
index 000..ddcedc7e238
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/pr103798-3.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
+
+__attribute__ ((weak))
+int
+f (char a)
+{
+   return  __builtin_memchr ("aEgZ", a, 3) == 0;
+}
+
+__attribute__ ((weak))
+int
+g (char a)
+{
+  return a != 'a' && a != 'E' && a != 'g';
+}
+
+int
+main ()
+{
+ for (int i = 0; i < 255; i++)
+   if (f (i) != g (i))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "memchr" } } */
diff --git 

Re: [PATCH v2] Simplify memchr with small constant strings

2022-07-12 Thread H.J. Lu via Gcc-patches
On Fri, Jul 8, 2022 at 5:54 AM Richard Biener
 wrote:
>
> On Thu, Jul 7, 2022 at 6:45 PM H.J. Lu  wrote:
> >
> > When memchr is applied on a constant string of no more than the bytes of
> > a word, simplify memchr by checking each byte in the constant string.
> >
> > int f (int a)
> > {
> >return  __builtin_memchr ("AE", a, 2) != 0;
> > }
> >
> > is simplified to
> >
> > int f (int a)
> > {
> >   return ((char) a == 'A' || (char) a == 'E') != 0;
> > }
> >
> > gcc/
> >
> > PR tree-optimization/103798
> > * tree-ssa-forwprop.cc: Include "tree-ssa-strlen.h".
> > (simplify_builtin_call): Inline memchr with constant strings of
> > no more than the bytes of a word.
> > * tree-ssa-strlen.cc (use_in_zero_equality): Make it global.
> > * tree-ssa-strlen.h (use_in_zero_equality): New.
> >
> > gcc/testsuite/
> >
> > PR tree-optimization/103798
> > * c-c++-common/pr103798-1.c: New test.
> > * c-c++-common/pr103798-2.c: Likewise.
> > * c-c++-common/pr103798-3.c: Likewise.
> > * c-c++-common/pr103798-4.c: Likewise.
> > * c-c++-common/pr103798-5.c: Likewise.
> > * c-c++-common/pr103798-6.c: Likewise.
> > * c-c++-common/pr103798-7.c: Likewise.
> > * c-c++-common/pr103798-8.c: Likewise.
> > ---
> >  gcc/testsuite/c-c++-common/pr103798-1.c | 28 +++
> >  gcc/testsuite/c-c++-common/pr103798-2.c | 30 
> >  gcc/testsuite/c-c++-common/pr103798-3.c | 28 +++
> >  gcc/testsuite/c-c++-common/pr103798-4.c | 28 +++
> >  gcc/testsuite/c-c++-common/pr103798-5.c | 26 ++
> >  gcc/testsuite/c-c++-common/pr103798-6.c | 27 +++
> >  gcc/testsuite/c-c++-common/pr103798-7.c | 27 +++
> >  gcc/testsuite/c-c++-common/pr103798-8.c | 27 +++
> >  gcc/tree-ssa-forwprop.cc| 64 +
> >  gcc/tree-ssa-strlen.cc  |  4 +-
> >  gcc/tree-ssa-strlen.h   |  2 +
> >  11 files changed, 289 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-1.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-2.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-3.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-4.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-5.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-6.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-7.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-8.c
> >
> > diff --git a/gcc/testsuite/c-c++-common/pr103798-1.c 
> > b/gcc/testsuite/c-c++-common/pr103798-1.c
> > new file mode 100644
> > index 000..cd3edf569fc
> > --- /dev/null
> > +++ b/gcc/testsuite/c-c++-common/pr103798-1.c
> > @@ -0,0 +1,28 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
> > +
> > +__attribute__ ((weak))
> > +int
> > +f (char a)
> > +{
> > +   return  __builtin_memchr ("a", a, 1) == 0;
> > +}
> > +
> > +__attribute__ ((weak))
> > +int
> > +g (char a)
> > +{
> > +  return a != 'a';
> > +}
> > +
> > +int
> > +main ()
> > +{
> > + for (int i = 0; i < 255; i++)
> > +   if (f (i) != g (i))
> > + __builtin_abort ();
> > +
> > + return 0;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "memchr" } } */
> > diff --git a/gcc/testsuite/c-c++-common/pr103798-2.c 
> > b/gcc/testsuite/c-c++-common/pr103798-2.c
> > new file mode 100644
> > index 000..e7e99c3679e
> > --- /dev/null
> > +++ b/gcc/testsuite/c-c++-common/pr103798-2.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
> > +
> > +#include <string.h>
> > +
> > +__attribute__ ((weak))
> > +int
> > +f (int a)
> > +{
> > +   return memchr ("aE", a, 2) != NULL;
> > +}
> > +
> > +__attribute__ ((weak))
> > +int
> > +g (char a)
> > +{
> > +  return a == 'a' || a == 'E';
> > +}
> > +
> > +int
> > +main ()
> > +{
> > + for (int i = 0; i < 255; i++)
> > +   if (f (i + 256) != g (i + 256))
> > + __builtin_abort ();
> > +
> > + return 0;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "memchr" } } */
> > diff --git a/gcc/testsuite/c-c++-common/pr103798-3.c 
> > b/gcc/testsuite/c-c++-common/pr103798-3.c
> > new file mode 100644
> > index 000..ddcedc7e238
> > --- /dev/null
> > +++ b/gcc/testsuite/c-c++-common/pr103798-3.c
> > @@ -0,0 +1,28 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
> > +
> > +__attribute__ ((weak))
> > +int
> > +f (char a)
> > +{
> > +   return  __builtin_memchr ("aEgZ", a, 3) == 0;
> > +}
> > +
> > +__attribute__ ((weak))
> > +int
> > +g (char a)
> > +{
> > +  return a != 'a' && a != 'E' && a != 'g';
> > +}
> > +
> > +int
> > +main ()
> > +{
> > + for (int i = 0; i < 255; i++)
> > +   if (f (i) != g (i))
> > + __builtin_abort ();
> > +
> > + return 0;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "memchr" } } */
> > diff 

Re: [x86_64 PATCH] Improved Scalar-To-Vector (STV) support for TImode to V1TImode.

2022-07-10 Thread H.J. Lu via Gcc-patches
On Sun, Jul 10, 2022 at 2:38 PM Roger Sayle  wrote:
>
>
> Hi HJ,
>
> I believe this should now be handled by the post-reload (CSE) pass.
> Consider the simple test case:
>
> __int128 a, b, c;
> void foo()
> {
>   a = 0;
>   b = 0;
>   c = 0;
> }
>
> Without any STV, i.e. -O2 -msse4 -mno-stv, GCC generates TI mode writes:
> movq$0, a(%rip)
> movq$0, a+8(%rip)
> movq$0, b(%rip)
> movq$0, b+8(%rip)
> movq$0, c(%rip)
> movq$0, c+8(%rip)
> ret
>
> But with STV, i.e. -O2 -msse4, things get converted to V1TI mode:
> pxor%xmm0, %xmm0
> movaps  %xmm0, a(%rip)
> movaps  %xmm0, b(%rip)
> movaps  %xmm0, c(%rip)
> ret
>
> You're quite right that internally the STV actually generates the equivalent of:
> pxor%xmm0, %xmm0
> movaps  %xmm0, a(%rip)
> pxor%xmm0, %xmm0
> movaps  %xmm0, b(%rip)
> pxor%xmm0, %xmm0
> movaps  %xmm0, c(%rip)
> ret
>
> And currently, because STV runs before cse2 and combine, the const0_rtx
> gets CSE'd by the cse2 pass to produce the code we see.  However, if you
> specify -fno-rerun-cse-after-loop (to disable the cse2 pass), you'll see we
> continue to generate the same optimized code, as the same const0_rtx
> gets CSE'd in postreload.
>
> I can't be certain until I try the experiment, but I believe that the 
> postreload
> CSE will clean up all of the same common subexpressions.  Hence, it should
> be safe to perform all STV at the same point (after combine), which allows
> for a few additional optimizations.
>
> Does this make sense?  Do you have a test case where -fno-rerun-cse-after-loop
> produces different/inferior code for TImode STV chains?
>
> My guess is that the RTL passes have changed so much in the last six or
> seven years, that some of the original motivation no longer applies.
> Certainly we now try to keep TI mode operations visible longer, and
> then allow STV to behave like a pre-reload pass to decide which set of
> registers to use (vector V1TI or scalar doubleword DI).  Any CSE opportunities
> that cse2 finds with V1TI mode, could/should equally well be found for
> TI mode (mostly).

You are probably right.  If there are no regressions in GCC testsuite,
my original motivation is no longer valid.

Thanks.

> Cheers,
> Roger
> --
>
> > -Original Message-
> > From: H.J. Lu 
> > Sent: 10 July 2022 20:15
> > To: Roger Sayle 
> > Cc: Uros Bizjak ; GCC Patches 
> > Subject: Re: [x86_64 PATCH] Improved Scalar-To-Vector (STV) support for
> > TImode to V1TImode.
> >
> > On Sun, Jul 10, 2022 at 11:36 AM Roger Sayle 
> > wrote:
> > >
> > >
> > > Hi Uros,
> > > Yes, I agree.  I think it makes sense to have a single STV pass (after
> > > combine and before reload).  Let's hear what HJ thinks, but I'm happy
> > > to investigate a follow-up patch that unifies the STV passes.
> > > But it'll be easier to confirm there are no "code generation" changes
> > > if those modifications are pushed independently of these ones.
> > > Time to look into the (git) history of multiple STV passes...
> > >
> > > Thanks for the review.  I'll wait for HJ's thoughts.
> >
> > The TImode STV pass is run before the CSE pass so that instructions changed 
> > or
> > generated by the STV pass can be CSEed.
> >
> > > Cheers,
> > > Roger
> > > --
> > >
> > > > -Original Message-
> > > > From: Uros Bizjak 
> > > > Sent: 10 July 2022 19:06
> > > > To: Roger Sayle 
> > > > Cc: gcc-patches@gcc.gnu.org; H. J. Lu 
> > > > Subject: Re: [x86_64 PATCH] Improved Scalar-To-Vector (STV) support
> > > > for TImode to V1TImode.
> > > >
> > > > On Sat, Jul 9, 2022 at 2:17 PM Roger Sayle
> > > > 
> > > > wrote:
> > > > >
> > > > >
> > > > > This patch upgrades x86_64's scalar-to-vector (STV) pass to more
> > > > > aggressively transform 128-bit scalar TImode operations into
> > > > > vector V1TImode operations performed on SSE registers.  TImode
> > > > > functionality already exists in STV, but only for move operations,
> > > > > this change brings support for logical operations (AND, IOR, XOR,
> > > > > NOT and ANDN) and comparisons.
> > > > >
> > > > > The effect of these changes is conveniently demonstrated by the
> > > > > new sse4_1-stv-5.c test case:
> > > > >
> > > > > __int128 a[16];
> > > > > __int128 b[16];
> > > > > __int128 c[16];
> > > > >
> > > > > void foo()
> > > > > {
> > > > >   for (unsigned int i=0; i<16; i++)
> > > > > a[i] = b[i] & ~c[i];
> > > > > }
> > > > >
> > > > > which when currently compiled on mainline with -O2 -msse4 produces:
> > > > >
> > > > > foo:xorl%eax, %eax
> > > > > .L2:movqc(%rax), %rsi
> > > > > movqc+8(%rax), %rdi
> > > > > addq$16, %rax
> > > > > notq%rsi
> > > > > notq%rdi
> > > > > andqb-16(%rax), %rsi
> > > > > andqb-8(%rax), %rdi
> > > > > movq%rsi, a-16(%rax)
> > > > > movq%rdi, 

Re: [x86_64 PATCH] Improved Scalar-To-Vector (STV) support for TImode to V1TImode.

2022-07-10 Thread H.J. Lu via Gcc-patches
On Sun, Jul 10, 2022 at 11:36 AM Roger Sayle  wrote:
>
>
> Hi Uros,
> Yes, I agree.  I think it makes sense to have a single STV pass (after
> combine and before reload).  Let's hear what HJ thinks, but I'm
> happy to investigate a follow-up patch that unifies the STV passes.
> But it'll be easier to confirm there are no "code generation" changes
> if those modifications are pushed independently of these ones.
> Time to look into the (git) history of multiple STV passes...
>
> Thanks for the review.  I'll wait for HJ's thoughts.

The TImode STV pass is run before the CSE pass so that
instructions changed or generated by the STV pass can be CSEed.

> Cheers,
> Roger
> --
>
> > -Original Message-
> > From: Uros Bizjak 
> > Sent: 10 July 2022 19:06
> > To: Roger Sayle 
> > Cc: gcc-patches@gcc.gnu.org; H. J. Lu 
> > Subject: Re: [x86_64 PATCH] Improved Scalar-To-Vector (STV) support for
> > TImode to V1TImode.
> >
> > On Sat, Jul 9, 2022 at 2:17 PM Roger Sayle 
> > wrote:
> > >
> > >
> > > This patch upgrades x86_64's scalar-to-vector (STV) pass to more
> > > aggressively transform 128-bit scalar TImode operations into vector
> > > V1TImode operations performed on SSE registers.  TImode functionality
> > > already exists in STV, but only for move operations, this change
> > > brings support for logical operations (AND, IOR, XOR, NOT and ANDN)
> > > and comparisons.
> > >
> > > The effect of these changes is conveniently demonstrated by the new
> > > sse4_1-stv-5.c test case:
> > >
> > > __int128 a[16];
> > > __int128 b[16];
> > > __int128 c[16];
> > >
> > > void foo()
> > > {
> > >   for (unsigned int i=0; i<16; i++)
> > > a[i] = b[i] & ~c[i];
> > > }
> > >
> > > which when currently compiled on mainline with -O2 -msse4 produces:
> > >
> > > foo:xorl%eax, %eax
> > > .L2:movqc(%rax), %rsi
> > > movqc+8(%rax), %rdi
> > > addq$16, %rax
> > > notq%rsi
> > > notq%rdi
> > > andqb-16(%rax), %rsi
> > > andqb-8(%rax), %rdi
> > > movq%rsi, a-16(%rax)
> > > movq%rdi, a-8(%rax)
> > > cmpq$256, %rax
> > > jne .L2
> > > ret
> > >
> > > but with this patch now produces:
> > >
> > > foo:xorl%eax, %eax
> > > .L2:movdqa  c(%rax), %xmm0
> > > pandn   b(%rax), %xmm0
> > > addq$16, %rax
> > > movaps  %xmm0, a-16(%rax)
> > > cmpq$256, %rax
> > > jne .L2
> > > ret
> > >
> > > Technically, the STV pass is implemented by three C++ classes, a
> > > common abstract base class "scalar_chain" that contains common
> > > functionality, and two derived classes: general_scalar_chain (which
> > > handles SI and DI modes) and timode_scalar_chain (which handles TI
> > > modes).  As mentioned previously, because only TI mode moves were
> > > handled the two worker classes behaved significantly differently.
> > > These changes bring the functionality of these two classes closer
> > > together, which is reflected by refactoring more shared code from
> > > general_scalar_chain to the parent scalar_chain and reusing it from
> > > timode.  There still remain significant differences (and
> > > simplifications) so the existing division of classes (as specializations) 
> > > continues
> > to make sense.
> >
> > Please note that there are in fact two STV passes, one before combine and 
> > the
> > other after combine. The TImode pass that previously handled only loads and
> > stores is positioned before combine (there was a reason for this decision, 
> > but I
> > don't remember the details - let's ask HJ...). However, DImode STV pass
> > transforms many more instructions and the reason it was positioned after the
> > combine pass was that STV pass transforms optimized insn stream where
> > forward propagation was already performed.
> >
> > What is not clear to me from the above explanation is: is the new TImode STV
> > pass positioned after the combine pass, and if this is the case, how the 
> > change
> > affects current load/store TImode STV pass. I must admit, I don't like two
> > separate STV passes, so if TImode is now similar to DImode, I suggest we
> > separate STV passess, so if TImode is now similar to DImode, I suggest we
> > abandon STV1 pass and do everything concerning TImode after the combine
> > pass. HJ, what is your opinion on this?
> >
> > Other than the above, the patch LGTM to me.
> >
> > Uros.
> >
> > > Obviously, there are more changes to come (shifts and rotates), and
> > > compute_convert_gain doesn't yet have its final (tuned) form, but is
> > > already an improvement over the "return 1;" used previously.
> > >
> > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > > and make -k check, both with and without --target_board=unix{-m32}
> > > with no new failures.  Ok for mainline?
> > >
> > >
> > > 2022-07-09  Roger Sayle  
> > >
> > > gcc/ChangeLog
> > > * config/i386/i386-features.h (scalar_chain): Add fields
> > > insns_conv, 

Re: [PATCH] Inline memchr with a small constant string

2022-07-07 Thread H.J. Lu via Gcc-patches
On Thu, Jun 23, 2022 at 9:26 AM H.J. Lu  wrote:
>
> On Wed, Jun 22, 2022 at 11:03 PM Richard Biener
>  wrote:
> >
> > On Wed, Jun 22, 2022 at 7:13 PM H.J. Lu  wrote:
> > >
> > > On Wed, Jun 22, 2022 at 4:39 AM Richard Biener
> > >  wrote:
> > > >
> > > > On Tue, Jun 21, 2022 at 11:03 PM H.J. Lu via Gcc-patches
> > > >  wrote:
> > > > >
> > > > > When memchr is applied on a constant string of no more than the bytes 
> > > > > of
> > > > > a word, inline memchr by checking each byte in the constant string.
> > > > >
> > > > > int f (int a)
> > > > > {
> > > > >return  __builtin_memchr ("eE", a, 2) != 0;
> > > > > }
> > > > >
> > > > > is simplified to
> > > > >
> > > > > int f (int a)
> > > > > {
> > > > >   return (char) a == 'e' || (char) a == 'E';
> > > > > }
> > > > >
> > > > > gcc/
> > > > >
> > > > > PR tree-optimization/103798
> > > > > * match.pd (__builtin_memchr (const_str, a, N)): Inline memchr
> > > > > with constant strings of no more than the bytes of a word.
> > > >
> > > > Please do this in strlenopt or so, with match.pd you will end up moving
> > > > the memchr loads across possible aliasing stores to the point of the
> > > > comparison.
> > >
> > > strlenopt is run after many other passes.  The code won't be well 
> > > optimized.
> >
> > What followup optimizations do you expect?  That is, other builtins are only
>
> reassociation and dce turn
>
>   _5 = a_2(D) == 101;
>   _6 = a_2(D) == 69;
>   _1 = _5 | _6;
>   _4 = (int) _1;
>
> into
>
>   _7 = a_2(D) & -33;
>   _8 = _7 == 69;
>   _1 = _8;
>   _4 = (int) _1;
>
> > expanded inline at RTL expansion time?
>
> Some high level optimizations will be missed and
> TARGET_GIMPLE_FOLD_BUILTIN improves builtins
> codegen.
>
> > > Since we are only optimizing
> > >
> > > __builtin_memchr ("eE", a, 2) != 0;
> > >
> > > I don't see any aliasing store issues here.
> >
> > Ah, I failed to see the STRING_CST restriction.  Note that when optimizing 
> > for
> > size this doesn't look very good.
>
> True.
>
> > I would expect a target might produce some vector code for
> > memchr ("aAbBcCdDeE...", c, 9) != 0 by splatting 'c', doing
> > a v16qimode compare, masking off excess elements beyond length
> > and then comparing against zero or for == 0 against all-ones.
> >
> > The repetitive pattern result also suggests an implementation elsewhere,
> > if you think strlenopt is too late there would be forwprop as well.
>
> forwprop seems a good place.

The v2 patch is at

https://gcc.gnu.org/pipermail/gcc-patches/2022-July/598022.html

Thanks.

-- 
H.J.


[PATCH v2] Simplify memchr with small constant strings

2022-07-07 Thread H.J. Lu via Gcc-patches
When memchr is applied on a constant string of no more than the bytes of
a word, simplify memchr by checking each byte in the constant string.

int f (int a)
{
   return  __builtin_memchr ("AE", a, 2) != 0;
}

is simplified to

int f (int a)
{
  return ((char) a == 'A' || (char) a == 'E') != 0;
}

gcc/

PR tree-optimization/103798
* tree-ssa-forwprop.cc: Include "tree-ssa-strlen.h".
(simplify_builtin_call): Inline memchr with constant strings of
no more than the bytes of a word.
* tree-ssa-strlen.cc (use_in_zero_equality): Make it global.
* tree-ssa-strlen.h (use_in_zero_equality): New.

gcc/testsuite/

PR tree-optimization/103798
* c-c++-common/pr103798-1.c: New test.
* c-c++-common/pr103798-2.c: Likewise.
* c-c++-common/pr103798-3.c: Likewise.
* c-c++-common/pr103798-4.c: Likewise.
* c-c++-common/pr103798-5.c: Likewise.
* c-c++-common/pr103798-6.c: Likewise.
* c-c++-common/pr103798-7.c: Likewise.
* c-c++-common/pr103798-8.c: Likewise.
---
 gcc/testsuite/c-c++-common/pr103798-1.c | 28 +++
 gcc/testsuite/c-c++-common/pr103798-2.c | 30 
 gcc/testsuite/c-c++-common/pr103798-3.c | 28 +++
 gcc/testsuite/c-c++-common/pr103798-4.c | 28 +++
 gcc/testsuite/c-c++-common/pr103798-5.c | 26 ++
 gcc/testsuite/c-c++-common/pr103798-6.c | 27 +++
 gcc/testsuite/c-c++-common/pr103798-7.c | 27 +++
 gcc/testsuite/c-c++-common/pr103798-8.c | 27 +++
 gcc/tree-ssa-forwprop.cc| 64 +
 gcc/tree-ssa-strlen.cc  |  4 +-
 gcc/tree-ssa-strlen.h   |  2 +
 11 files changed, 289 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-1.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-2.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-3.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-4.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-5.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-6.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-7.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-8.c

diff --git a/gcc/testsuite/c-c++-common/pr103798-1.c 
b/gcc/testsuite/c-c++-common/pr103798-1.c
new file mode 100644
index 000..cd3edf569fc
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/pr103798-1.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
+
+__attribute__ ((weak))
+int
+f (char a)
+{
+   return  __builtin_memchr ("a", a, 1) == 0;
+}
+
+__attribute__ ((weak))
+int
+g (char a)
+{
+  return a != 'a';
+}
+
+int
+main ()
+{
+ for (int i = 0; i < 255; i++)
+   if (f (i) != g (i))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "memchr" } } */
diff --git a/gcc/testsuite/c-c++-common/pr103798-2.c 
b/gcc/testsuite/c-c++-common/pr103798-2.c
new file mode 100644
index 000..e7e99c3679e
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/pr103798-2.c
@@ -0,0 +1,30 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
+
+#include <string.h>
+
+__attribute__ ((weak))
+int
+f (int a)
+{
+   return memchr ("aE", a, 2) != NULL;
+}
+
+__attribute__ ((weak))
+int
+g (char a)
+{
+  return a == 'a' || a == 'E';
+}
+
+int
+main ()
+{
+ for (int i = 0; i < 255; i++)
+   if (f (i + 256) != g (i + 256))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "memchr" } } */
diff --git a/gcc/testsuite/c-c++-common/pr103798-3.c 
b/gcc/testsuite/c-c++-common/pr103798-3.c
new file mode 100644
index 00000000000..ddcedc7e238
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/pr103798-3.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
+
+__attribute__ ((weak))
+int
+f (char a)
+{
+   return  __builtin_memchr ("aEgZ", a, 3) == 0;
+}
+
+__attribute__ ((weak))
+int
+g (char a)
+{
+  return a != 'a' && a != 'E' && a != 'g';
+}
+
+int
+main ()
+{
+ for (int i = 0; i < 255; i++)
+   if (f (i) != g (i))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "memchr" } } */
diff --git a/gcc/testsuite/c-c++-common/pr103798-4.c 
b/gcc/testsuite/c-c++-common/pr103798-4.c
new file mode 100644
index 00000000000..00e8302a833
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/pr103798-4.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-tree-optimized -save-temps" } */
+
+__attribute__ ((weak))
+int
+f (char a)
+{
+   return  __builtin_memchr ("aEgi", a, 4) != 0;
+}
+
+__attribute__ ((weak))
+int
+g (char a)
+{
+  return a == 'a' || a == 'E' || a == 'g' || a == 'i';
+}
+
+int
+main ()
+{
+ for (int i = 0; i < 255; i++)
+   if (f (i) != g (i))
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not "memchr" } } */
diff --git a/gcc/testsuite/c-c++-common/pr103798-5.c 

Re: [PATCH v2] Enable __memcmpeq after seeing __memcmpeq prototype

2022-07-05 Thread H.J. Lu via Gcc-patches
On Fri, Jul 1, 2022 at 12:51 AM Richard Biener
 wrote:
>
> On Mon, Jun 20, 2022 at 5:44 PM H.J. Lu  wrote:
> >
> > extern int __memcmpeq (const void *, const void *, size_t);
> >
> > was added to GLIBC 2.35.  Expand BUILT_IN_MEMCMP_EQ to __memcmpeq
> > after seeing __memcmpeq prototype
>
> Can you instead use builtin_decl_declared_p (), see how frontends
> set that via set_builtin_decl_declared_p?

The v3 patch is at

https://gcc.gnu.org/pipermail/gcc-patches/2022-July/597861.html

Thanks.

> > gcc/
> >
> > * builtins.cc (have_memcmpeq_prototype): New.
> > (expand_builtin): Issue an error for BUILT_IN___MEMCMPEQ if
> > there is no __memcmpeq prototype.  Expand BUILT_IN_MEMCMP_EQ
> > > to BUILT_IN___MEMCMPEQ if there is __memcmpeq prototype.
> > * builtins.def (BUILT_IN___MEMCMPEQ): New.
> > * builtins.h (have_memcmpeq_prototype): New.
> >
> > gcc/c/
> >
> > * c-decl.cc (diagnose_mismatched_decls): Set
> > have_memcmpeq_prototype to true after seeing __memcmpeq prototype.
> >
> > gcc/cp/
> >
> > *  decl.cc (duplicate_decls): Set have_memcmpeq_prototype to true
> > after seeing __memcmpeq prototype.
> >
> > gcc/testsuite/
> >
> > * c-c++-common/memcmpeq-1.c: New test.
> > * c-c++-common/memcmpeq-2.c: Likewise.
> > * c-c++-common/memcmpeq-3.c: Likewise.
> > * c-c++-common/memcmpeq-4.c: Likewise.
> > * c-c++-common/memcmpeq-5.c: Likewise.
> > * c-c++-common/memcmpeq-6.c: Likewise.
> > * c-c++-common/memcmpeq.h: Likewise.
> > ---
> >  gcc/builtins.cc | 17 -
> >  gcc/builtins.def|  3 +++
> >  gcc/builtins.h  |  3 +++
> >  gcc/c/c-decl.cc | 25 ++---
> >  gcc/cp/decl.cc  |  5 +
> >  gcc/testsuite/c-c++-common/memcmpeq-1.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-2.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-3.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-4.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-5.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-6.c | 10 ++
> >  gcc/testsuite/c-c++-common/memcmpeq.h   | 11 +++
> >  12 files changed, 121 insertions(+), 8 deletions(-)
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-1.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-2.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-3.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-4.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-5.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-6.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq.h
> >
> > diff --git a/gcc/builtins.cc b/gcc/builtins.cc
> > index 971b18c3745..96e283e5847 100644
> > --- a/gcc/builtins.cc
> > +++ b/gcc/builtins.cc
> > @@ -104,6 +104,9 @@ builtin_info_type builtin_info[(int)END_BUILTINS];
> >  /* Non-zero if __builtin_constant_p should be folded right away.  */
> >  bool force_folding_builtin_constant_p;
> >
> > +/* True if there is a __memcmpeq prototype.  */
> > +bool have_memcmpeq_prototype;
> > +
> >  static int target_char_cast (tree, char *);
> >  static int apply_args_size (void);
> >  static int apply_result_size (void);
> > @@ -7392,6 +7395,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
> > machine_mode mode,
> > return target;
> >break;
> >
> > +case BUILT_IN___MEMCMPEQ:
> > +  if (!have_memcmpeq_prototype)
> > +   {
> > + error ("use of %<__builtin___memcmpeq ()%> without "
> > +"%<__memcmpeq%> prototype");
> > + return const0_rtx;
> > +   }
> > +  break;
> > +
> >  /* Expand it as BUILT_IN_MEMCMP_EQ first. If not successful, change it
> > back to a BUILT_IN_STRCMP. Remember to delete the 3rd parameter
> > when changing it to a strcmp call.  */
> > @@ -7445,7 +7457,10 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
> > machine_mode mode,
> > return target;
> >if (fcode == BUILT_IN_MEMCMP_EQ)
> > {
> > - tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
> > + tree newdecl = builtin_decl_explicit
> > +   (have_memcmpeq_prototype
> > +? BUILT_IN___MEMCMPEQ
> > +: BUILT_IN_MEMCMP);
> >   TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
> > }
> >break;
> > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > index 005976f34e9..95642c6acdf 100644
> > --- a/gcc/builtins.def
> > +++ b/gcc/builtins.def
> > @@ -965,6 +965,9 @@ DEF_BUILTIN_STUB (BUILT_IN_ALLOCA_WITH_ALIGN_AND_MAX, 
> > "__builtin_alloca_with_ali
> > equality with zero.  */
> >  DEF_BUILTIN_STUB (BUILT_IN_MEMCMP_EQ, "__builtin_memcmp_eq")
> >
> > +/* Similar to BUILT_IN_MEMCMP_EQ, but is mapped to __memcmpeq. 

[PATCH v3] Enable __memcmpeq after seeing __memcmpeq prototype

2022-07-05 Thread H.J. Lu via Gcc-patches
extern int __memcmpeq (const void *, const void *, size_t);

was added to GLIBC 2.35.  Expand BUILT_IN_MEMCMP_EQ to __memcmpeq
after seeing __memcmpeq prototype

gcc/

* builtins.cc (expand_builtin): Issue an error for
BUILT_IN___MEMCMPEQ if there is no __memcmpeq prototype.  Expand
BUILT_IN_MEMCMP_EQ to BUILT_IN___MEMCMPEQ if there is __memcmpeq
prototype.
* builtins.def (BUILT_IN___MEMCMPEQ): New.

gcc/testsuite/

* c-c++-common/memcmpeq-1.c: New test.
* c-c++-common/memcmpeq-2.c: Likewise.
* c-c++-common/memcmpeq-3.c: Likewise.
* c-c++-common/memcmpeq-4.c: Likewise.
* c-c++-common/memcmpeq-5.c: Likewise.
* c-c++-common/memcmpeq-6.c: Likewise.
* c-c++-common/memcmpeq.h: Likewise.
---
 gcc/builtins.cc | 14 +-
 gcc/builtins.def|  3 +++
 gcc/testsuite/c-c++-common/memcmpeq-1.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-2.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-3.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-4.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-5.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-6.c | 10 ++
 gcc/testsuite/c-c++-common/memcmpeq.h   | 11 +++
 9 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-1.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-2.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-3.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-4.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-5.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-6.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq.h

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index e6816d5c865..2254a597bec 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -7395,6 +7395,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
machine_mode mode,
return target;
   break;
 
+case BUILT_IN___MEMCMPEQ:
+  if (!builtin_decl_declared_p (BUILT_IN___MEMCMPEQ))
+   {
+ error ("use of %<__builtin___memcmpeq ()%> without "
+"%<__memcmpeq%> prototype");
+ return const0_rtx;
+   }
+  break;
+
 /* Expand it as BUILT_IN_MEMCMP_EQ first. If not successful, change it
back to a BUILT_IN_STRCMP. Remember to delete the 3rd parameter
when changing it to a strcmp call.  */
@@ -7448,7 +7457,10 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
machine_mode mode,
return target;
   if (fcode == BUILT_IN_MEMCMP_EQ)
{
- tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
+ tree newdecl = builtin_decl_explicit
+   (builtin_decl_declared_p (BUILT_IN___MEMCMPEQ)
+? BUILT_IN___MEMCMPEQ
+: BUILT_IN_MEMCMP);
  TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
}
   break;
diff --git a/gcc/builtins.def b/gcc/builtins.def
index 005976f34e9..95642c6acdf 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -965,6 +965,9 @@ DEF_BUILTIN_STUB (BUILT_IN_ALLOCA_WITH_ALIGN_AND_MAX, 
"__builtin_alloca_with_ali
equality with zero.  */
 DEF_BUILTIN_STUB (BUILT_IN_MEMCMP_EQ, "__builtin_memcmp_eq")
 
+/* Similar to BUILT_IN_MEMCMP_EQ, but is mapped to __memcmpeq.  */
+DEF_EXT_LIB_BUILTIN (BUILT_IN___MEMCMPEQ, "__memcmpeq", 
BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
+
 /* An internal version of strcmp/strncmp, used when the result is only 
tested for equality with zero.  */
 DEF_BUILTIN_STUB (BUILT_IN_STRCMP_EQ, "__builtin_strcmp_eq")
diff --git a/gcc/testsuite/c-c++-common/memcmpeq-1.c 
b/gcc/testsuite/c-c++-common/memcmpeq-1.c
new file mode 100644
index 00000000000..14622f0d765
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/memcmpeq-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler "__memcmpeq" } } */
+
+#include "memcmpeq.h"
+
+int
+foo (const char *s1, const char *s2, size_t len)
+{
+  return __builtin_memcmp (s1, s2, len) != 0;
+}
diff --git a/gcc/testsuite/c-c++-common/memcmpeq-2.c 
b/gcc/testsuite/c-c++-common/memcmpeq-2.c
new file mode 100644
index 00000000000..f57f279f173
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/memcmpeq-2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler "__memcmpeq" } } */
+
+#include "memcmpeq.h"
+
+int
+foo (const char *s1, const char *s2, size_t len)
+{
+  return memcmp (s1, s2, len) == 0;
+}
diff --git a/gcc/testsuite/c-c++-common/memcmpeq-3.c 
b/gcc/testsuite/c-c++-common/memcmpeq-3.c
new file mode 100644
index 00000000000..2ca2131c23a
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/memcmpeq-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "__memcmpeq" } } */
+
+#include "memcmpeq.h"
+
+int
+foo (const 

[PATCH] x86: Support 2/4/8 byte constant vector stores

2022-06-30 Thread H.J. Lu via Gcc-patches
1. Add a predicate for constant vectors which can be converted to integer
constants suitable for constant integer stores.  For a 8-byte constant
vector, the converted 64-bit integer must be valid for store with 64-bit
immediate, which is a 64-bit integer sign-extended from a 32-bit integer.
2. Add a new pattern to allow 2-byte, 4-byte and 8-byte constant vector
stores, like

(set (mem:V2HI (reg:DI 84))
 (const_vector:V2HI [(const_int 0 [0]) (const_int 1 [0x1])]))

3. After reload, convert constant vector stores to constant integer
stores, like

(set (mem:SI (reg:DI 5 di [84]))
 (const_int 65536 [0x10000]))

For

void
foo (short * c)
{
  c[0] = 0;
  c[1] = 1;
}

it generates

movl    $65536, (%rdi)

instead of

movl    .LC0(%rip), %eax
movl    %eax, (%rdi)

gcc/

PR target/106022
* config/i386/i386-protos.h (ix86_convert_const_vector_to_integer):
New.
* config/i386/i386.cc (ix86_convert_const_vector_to_integer):
New.
* config/i386/mmx.md (V_16_32_64): New.
(*mov<mode>_imm): New patterns for stores with 16-bit, 32-bit
and 64-bit constant vector.
* config/i386/predicates.md (x86_64_const_vector_operand): New.

gcc/testsuite/

PR target/106022
* gcc.target/i386/pr106022-1.c: New test.
* gcc.target/i386/pr106022-2.c: Likewise.
* gcc.target/i386/pr106022-3.c: Likewise.
* gcc.target/i386/pr106022-4.c: Likewise.
---
 gcc/config/i386/i386-protos.h  |  2 +
 gcc/config/i386/i386.cc| 47 ++
 gcc/config/i386/mmx.md | 37 +
 gcc/config/i386/predicates.md  | 11 +
 gcc/testsuite/gcc.target/i386/pr106022-1.c | 13 ++
 gcc/testsuite/gcc.target/i386/pr106022-2.c | 14 +++
 gcc/testsuite/gcc.target/i386/pr106022-3.c | 14 +++
 gcc/testsuite/gcc.target/i386/pr106022-4.c | 14 +++
 8 files changed, 152 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106022-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106022-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106022-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106022-4.c

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 3596ce81ecf..cf847751ac5 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -122,6 +122,8 @@ extern void ix86_expand_unary_operator (enum rtx_code, 
machine_mode,
rtx[]);
 extern rtx ix86_build_const_vector (machine_mode, bool, rtx);
 extern rtx ix86_build_signbit_mask (machine_mode, bool, bool);
+extern HOST_WIDE_INT ix86_convert_const_vector_to_integer (rtx,
+  machine_mode);
 extern void ix86_split_convert_uns_si_sse (rtx[]);
 extern void ix86_expand_convert_uns_didf_sse (rtx, rtx);
 extern void ix86_expand_convert_uns_sixf_sse (rtx, rtx);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b15b4893bb9..0cfe9962f75 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -15723,6 +15723,53 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, 
bool invert)
   return force_reg (vec_mode, v);
 }
 
+/* Return HOST_WIDE_INT for const vector OP in MODE.  */
+
+HOST_WIDE_INT
+ix86_convert_const_vector_to_integer (rtx op, machine_mode mode)
+{
+  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+gcc_unreachable ();
+
+  int nunits = GET_MODE_NUNITS (mode);
+  wide_int val = wi::zero (GET_MODE_BITSIZE (mode));
+  machine_mode innermode = GET_MODE_INNER (mode);
+  unsigned int innermode_bits = GET_MODE_BITSIZE (innermode);
+
+  switch (mode)
+{
+case E_V2QImode:
+case E_V4QImode:
+case E_V2HImode:
+case E_V8QImode:
+case E_V4HImode:
+case E_V2SImode:
+  for (int i = 0; i < nunits; ++i)
+   {
+ int v = INTVAL (XVECEXP (op, 0, i));
+ wide_int wv = wi::shwi (v, innermode_bits);
+ val = wi::insert (val, wv, innermode_bits * i, innermode_bits);
+   }
+  break;
+case E_V2HFmode:
+case E_V4HFmode:
+case E_V2SFmode:
+  for (int i = 0; i < nunits; ++i)
+   {
+ rtx x = XVECEXP (op, 0, i);
+ int v = real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (x),
+ REAL_MODE_FORMAT (innermode));
+ wide_int wv = wi::shwi (v, innermode_bits);
+ val = wi::insert (val, wv, innermode_bits * i, innermode_bits);
+   }
+  break;
+default:
+  gcc_unreachable ();
+}
+
+  return val.to_shwi ();
+}
+
 /* Return TRUE or FALSE depending on whether the first SET in INSN
has source and destination with matching CC modes, and that the
CC mode is at least as constrained as REQ_MODE.  */
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index ba53007a35e..3294c1e6274 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -69,6 +69,12 

Re: PING^1 [PATCH] x86: Skip ENDBR when emitting direct call/jmp to local function

2022-06-27 Thread H.J. Lu via Gcc-patches
On Sun, Jun 26, 2022 at 10:50 PM Hongtao Liu  wrote:
>
> On Tue, Jun 21, 2022 at 3:50 AM Uros Bizjak via Gcc-patches
>  wrote:
> >
> > On Mon, Jun 20, 2022 at 8:14 PM H.J. Lu  wrote:
> > >
> > > On Tue, May 10, 2022 at 9:25 AM H.J. Lu  wrote:
> > > >
> > > > Mark a function with SYMBOL_FLAG_FUNCTION_ENDBR when inserting ENDBR at
> > > > function entry.  Skip the 4-byte ENDBR when emitting a direct call/jmp
> > > > to a local function with ENDBR at function entry.
> > > >
> > > > This has been tested on Linux kernel.
> > > >
> > > > gcc/
> > > >
> > > > PR target/102953
> > > > * config/i386/i386-features.cc
> > > > (rest_of_insert_endbr_and_patchable_area): Set
> > > > SYMBOL_FLAG_FUNCTION_ENDBR when inserting ENDBR.
> > > > * config/i386/i386.cc (ix86_print_operand): Skip the 4-byte 
> > > > ENDBR
> > > > when calling the local function with ENDBR at function entry.
> > > > * config/i386/i386.h (SYMBOL_FLAG_FUNCTION_ENDBR): New.
> > > > (SYMBOL_FLAG_FUNCTION_ENDBR_P): Likewise.
> > > >
> > > > gcc/testsuite/
> > > >
> > > > PR target/102953
> > > > * gcc.target/i386/pr102953-1.c: New test.
> > > > * gcc.target/i386/pr102953-2.c: Likewise.
> The patch looks good to me.
> For direct call, endbr64 should not be used as a marker, right?

Correct.

> > > > ---
> > > >  gcc/config/i386/i386-features.cc   |  2 ++
> > > >  gcc/config/i386/i386.cc| 11 +++-
> > > >  gcc/config/i386/i386.h |  5 
> > > >  gcc/testsuite/gcc.target/i386/pr102953-1.c | 25 ++
> > > >  gcc/testsuite/gcc.target/i386/pr102953-2.c | 30 ++
> > > >  5 files changed, 72 insertions(+), 1 deletion(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102953-1.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102953-2.c
> > > >
> > > > diff --git a/gcc/config/i386/i386-features.cc 
> > > > b/gcc/config/i386/i386-features.cc
> > > > index 6fe41c3c24f..3ca1131ed59 100644
> > > > --- a/gcc/config/i386/i386-features.cc
> > > > +++ b/gcc/config/i386/i386-features.cc
> > > > @@ -1979,6 +1979,8 @@ rest_of_insert_endbr_and_patchable_area (bool 
> > > > need_endbr,
> > > >   || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
> > > > >   && DECL_DLLIMPORT_P (cfun->decl))))
> > > > {
> > > > + rtx symbol = XEXP (DECL_RTL (cfun->decl), 0);
> > > > + SYMBOL_REF_FLAGS (symbol) |= SYMBOL_FLAG_FUNCTION_ENDBR;
> > > >   if (crtl->profile && flag_fentry)
> > > > {
> > > >   /* Queue ENDBR insertion to x86_function_profiler.
> > > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > > index 86752a6516a..ad1de239bef 100644
> > > > --- a/gcc/config/i386/i386.cc
> > > > +++ b/gcc/config/i386/i386.cc
> > > > @@ -13787,7 +13787,16 @@ ix86_print_operand (FILE *file, rtx x, int 
> > > > code)
> > > >else if (flag_pic || MACHOPIC_INDIRECT)
> > > > output_pic_addr_const (file, x, code);
> > > >else
> > > > -   output_addr_const (file, x);
> > > > +   {
> > > > + /* Skip ENDBR when emitting a direct call/jmp to a local
> > > > +function with ENDBR at function entry.  */
> > > > + if (code == 'P'
> > > > + && GET_CODE (x) == SYMBOL_REF
> > > > + && SYMBOL_REF_LOCAL_P (x)
> > > > + && SYMBOL_FLAG_FUNCTION_ENDBR_P (x))
> > > > +   x = gen_rtx_PLUS (Pmode, x, GEN_INT (4));
> > > > + output_addr_const (file, x);
> > > > +   }
> > > >  }
> > > >  }
> > > >
> > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > > index 363082ba47b..7a6317fea57 100644
> > > > --- a/gcc/config/i386/i386.h
> > > > +++ b/gcc/config/i386/i386.h
> > > > @@ -2792,6 +2792,11 @@ extern GTY(()) tree ms_va_list_type_node;
> > > >  #define SYMBOL_REF_STUBVAR_P(X) \
> > > > ((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_STUBVAR) != 0)
> > > >
> > > > +/* Flag to mark a function with ENDBR at entry.  */
> > > > +#define SYMBOL_FLAG_FUNCTION_ENDBR (SYMBOL_FLAG_MACH_DEP << 5)
> > > > +#define SYMBOL_FLAG_FUNCTION_ENDBR_P(X) \
> > > > +   ((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_FUNCTION_ENDBR) != 0)
> > > > +
> > > >  extern void debug_ready_dispatch (void);
> > > >  extern void debug_dispatch_window (int);
> > > >
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr102953-1.c 
> > > > b/gcc/testsuite/gcc.target/i386/pr102953-1.c
> > > > new file mode 100644
> > > > > index 00000000000..2afad391baf
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr102953-1.c
> > > > @@ -0,0 +1,25 @@
> > > > +/* { dg-do compile { target { ! *-*-darwin* } } } */
> > > > +/* { dg-options "-O2 -fno-pic -fplt -fcf-protection" } */
> > > > +
> > > > +extern int func (int);
> > > > +
> > > > +extern int i;
> > > > +
> > > > +__attribute__ ((noclone, noinline, noipa))
> > > > +static int
> > > > +bar 

Re: [PATCH] Inline memchr with a small constant string

2022-06-23 Thread H.J. Lu via Gcc-patches
On Wed, Jun 22, 2022 at 11:03 PM Richard Biener
 wrote:
>
> On Wed, Jun 22, 2022 at 7:13 PM H.J. Lu  wrote:
> >
> > On Wed, Jun 22, 2022 at 4:39 AM Richard Biener
> >  wrote:
> > >
> > > On Tue, Jun 21, 2022 at 11:03 PM H.J. Lu via Gcc-patches
> > >  wrote:
> > > >
> > > > When memchr is applied on a constant string of no more than the bytes of
> > > > a word, inline memchr by checking each byte in the constant string.
> > > >
> > > > int f (int a)
> > > > {
> > > >return  __builtin_memchr ("eE", a, 2) != 0;
> > > > }
> > > >
> > > > is simplified to
> > > >
> > > > int f (int a)
> > > > {
> > > >   return (char) a == 'e' || (char) a == 'E';
> > > > }
> > > >
> > > > gcc/
> > > >
> > > > PR tree-optimization/103798
> > > > * match.pd (__builtin_memchr (const_str, a, N)): Inline memchr
> > > > with constant strings of no more than the bytes of a word.
> > >
> > > Please do this in strlenopt or so, with match.pd you will end up moving
> > > the memchr loads across possible aliasing stores to the point of the
> > > comparison.
> >
> > strlenopt is run after many other passes.  The code won't be well optimized.
>
> What followup optimizations do you expect?  That is, other builtins are only

reassociation and dce turn

  _5 = a_2(D) == 101;
  _6 = a_2(D) == 69;
  _1 = _5 | _6;
  _4 = (int) _1;

into

  _7 = a_2(D) & -33;
  _8 = _7 == 69;
  _1 = _8;
  _4 = (int) _1;

> expanded inline at RTL expansion time?

Some high level optimizations will be missed and
TARGET_GIMPLE_FOLD_BUILTIN improves builtins
codegen.

> > Since we are only optimizing
> >
> > __builtin_memchr ("eE", a, 2) != 0;
> >
> > I don't see any aliasing store issues here.
>
> Ah, I failed to see the STRING_CST restriction.  Note that when optimizing for
> size this doesn't look very good.

True.

> I would expect a target might produce some vector code for
> memchr ("aAbBcCdDeE...", c, 9) != 0 by splatting 'c', doing
> a v16qimode compare, masking off excess elements beyond length
> and then comparing against zero or for == 0 against all-ones.
>
> The repetitive pattern result also suggests an implementation elsewhere,
> if you think strlenopt is too late there would be forwprop as well.

forwprop seems a good place.

Thanks.

> Richard.
>
>
>
> > > Richard.
> > >
> > > > gcc/testsuite/
> > > >
> > > > PR tree-optimization/103798
> > > > * c-c++-common/pr103798-1.c: New test.
> > > > * c-c++-common/pr103798-2.c: Likewise.
> > > > * c-c++-common/pr103798-3.c: Likewise.
> > > > * c-c++-common/pr103798-4.c: Likewise.
> > > > * c-c++-common/pr103798-5.c: Likewise.
> > > > * c-c++-common/pr103798-6.c: Likewise.
> > > > * c-c++-common/pr103798-7.c: Likewise.
> > > > * c-c++-common/pr103798-8.c: Likewise.
> > > > ---
> > > >  gcc/match.pd| 136 
> > > >  gcc/testsuite/c-c++-common/pr103798-1.c |  28 +
> > > >  gcc/testsuite/c-c++-common/pr103798-2.c |  30 ++
> > > >  gcc/testsuite/c-c++-common/pr103798-3.c |  28 +
> > > >  gcc/testsuite/c-c++-common/pr103798-4.c |  28 +
> > > >  gcc/testsuite/c-c++-common/pr103798-5.c |  26 +
> > > >  gcc/testsuite/c-c++-common/pr103798-6.c |  27 +
> > > >  gcc/testsuite/c-c++-common/pr103798-7.c |  27 +
> > > >  gcc/testsuite/c-c++-common/pr103798-8.c |  27 +
> > > >  9 files changed, 357 insertions(+)
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-1.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-2.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-3.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-4.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-5.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-6.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-7.c
> > > >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-8.c
> > > >
> > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > index a63b649841b..aa4766749af 100644
> > > > --- a/g

Re: [PATCH] Inline memchr with a small constant string

2022-06-22 Thread H.J. Lu via Gcc-patches
On Wed, Jun 22, 2022 at 4:39 AM Richard Biener
 wrote:
>
> On Tue, Jun 21, 2022 at 11:03 PM H.J. Lu via Gcc-patches
>  wrote:
> >
> > When memchr is applied on a constant string of no more than the bytes of
> > a word, inline memchr by checking each byte in the constant string.
> >
> > int f (int a)
> > {
> >return  __builtin_memchr ("eE", a, 2) != 0;
> > }
> >
> > is simplified to
> >
> > int f (int a)
> > {
> >   return (char) a == 'e' || (char) a == 'E';
> > }
> >
> > gcc/
> >
> > PR tree-optimization/103798
> > * match.pd (__builtin_memchr (const_str, a, N)): Inline memchr
> > with constant strings of no more than the bytes of a word.
>
> Please do this in strlenopt or so, with match.pd you will end up moving
> the memchr loads across possible aliasing stores to the point of the
> comparison.

strlenopt is run after many other passes.  The code won't be well optimized.
Since we are only optimizing

__builtin_memchr ("eE", a, 2) != 0;

I don't see any aliasing store issues here.

> Richard.
>
> > gcc/testsuite/
> >
> > PR tree-optimization/103798
> > * c-c++-common/pr103798-1.c: New test.
> > * c-c++-common/pr103798-2.c: Likewise.
> > * c-c++-common/pr103798-3.c: Likewise.
> > * c-c++-common/pr103798-4.c: Likewise.
> > * c-c++-common/pr103798-5.c: Likewise.
> > * c-c++-common/pr103798-6.c: Likewise.
> > * c-c++-common/pr103798-7.c: Likewise.
> > * c-c++-common/pr103798-8.c: Likewise.
> > ---
> >  gcc/match.pd| 136 
> >  gcc/testsuite/c-c++-common/pr103798-1.c |  28 +
> >  gcc/testsuite/c-c++-common/pr103798-2.c |  30 ++
> >  gcc/testsuite/c-c++-common/pr103798-3.c |  28 +
> >  gcc/testsuite/c-c++-common/pr103798-4.c |  28 +
> >  gcc/testsuite/c-c++-common/pr103798-5.c |  26 +
> >  gcc/testsuite/c-c++-common/pr103798-6.c |  27 +
> >  gcc/testsuite/c-c++-common/pr103798-7.c |  27 +
> >  gcc/testsuite/c-c++-common/pr103798-8.c |  27 +
> >  9 files changed, 357 insertions(+)
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-1.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-2.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-3.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-4.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-5.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-6.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-7.c
> >  create mode 100644 gcc/testsuite/c-c++-common/pr103798-8.c
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index a63b649841b..aa4766749af 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -7976,3 +7976,139 @@ and,
> >  (match (bitwise_induction_p @0 @2 @3)
> >   (bit_not
> >    (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3))))
> > +
> > +#if GIMPLE
> > +/* __builtin_memchr (const_str, a, N) != 0 ->
> > +   a == const_str[0] .. || a == const_str[N-1]
> > +   __builtin_memchr (const_str, a, N) == 0 ->
> > +   a != const_str[0] .. && a != const_str[N-1]
> > +   where N is less than the string size.  */
> > +(for cmp (eq ne)
> > + icmp (ne eq)
> > + bit_op (bit_and bit_ior)
> > + (simplify (cmp:c @0 (BUILT_IN_MEMCHR ADDR_EXPR@1 @2 INTEGER_CST@3))
> > +  (if (UNITS_PER_WORD <= 8
> > +   && CHAR_TYPE_SIZE == 8
> > +   && BITS_PER_UNIT == 8
> > +   && CHAR_BIT == 8
> > +   && integer_zerop (@0)
> > +   && !integer_zerop (@3)
> > +   && TREE_CODE (TREE_OPERAND (@1, 0)) == STRING_CST
> > +   && TREE_STRING_LENGTH (TREE_OPERAND (@1, 0)) >= 2
> > +   && wi::leu_p (wi::to_wide (@3), UNITS_PER_WORD)
> > +   && wi::ltu_p (wi::to_wide (@3),
> > +TREE_STRING_LENGTH (TREE_OPERAND (@1, 0))))
> > +   (with
> > +{
> > +  const char *p = TREE_STRING_POINTER (TREE_OPERAND (@1, 0));
> > +  unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (@3);
> > +}
> > +(switch
> > + (if (size == 1)
> > +  (icmp (convert:char_type_node @2)
> > +   { build_int_cst (char_type_node, p[0]); }))
> > + (if (size == 2)
> > +  (bit_op
> > +   (icmp (convert:char_type_node @2)
> > +{ build_int_cst (cha

[PATCH] Inline memchr with a small constant string

2022-06-21 Thread H.J. Lu via Gcc-patches
When memchr is applied on a constant string of no more than the bytes of
a word, inline memchr by checking each byte in the constant string.

int f (int a)
{
   return  __builtin_memchr ("eE", a, 2) != 0;
}

is simplified to

int f (int a)
{
  return (char) a == 'e' || (char) a == 'E';
}

gcc/

PR tree-optimization/103798
* match.pd (__builtin_memchr (const_str, a, N)): Inline memchr
with constant strings of no more than the bytes of a word.

gcc/testsuite/

PR tree-optimization/103798
* c-c++-common/pr103798-1.c: New test.
* c-c++-common/pr103798-2.c: Likewise.
* c-c++-common/pr103798-3.c: Likewise.
* c-c++-common/pr103798-4.c: Likewise.
* c-c++-common/pr103798-5.c: Likewise.
* c-c++-common/pr103798-6.c: Likewise.
* c-c++-common/pr103798-7.c: Likewise.
* c-c++-common/pr103798-8.c: Likewise.
---
 gcc/match.pd| 136 
 gcc/testsuite/c-c++-common/pr103798-1.c |  28 +
 gcc/testsuite/c-c++-common/pr103798-2.c |  30 ++
 gcc/testsuite/c-c++-common/pr103798-3.c |  28 +
 gcc/testsuite/c-c++-common/pr103798-4.c |  28 +
 gcc/testsuite/c-c++-common/pr103798-5.c |  26 +
 gcc/testsuite/c-c++-common/pr103798-6.c |  27 +
 gcc/testsuite/c-c++-common/pr103798-7.c |  27 +
 gcc/testsuite/c-c++-common/pr103798-8.c |  27 +
 9 files changed, 357 insertions(+)
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-1.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-2.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-3.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-4.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-5.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-6.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-7.c
 create mode 100644 gcc/testsuite/c-c++-common/pr103798-8.c

diff --git a/gcc/match.pd b/gcc/match.pd
index a63b649841b..aa4766749af 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7976,3 +7976,139 @@ and,
 (match (bitwise_induction_p @0 @2 @3)
  (bit_not
   (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3
+
+#if GIMPLE
+/* __builtin_memchr (const_str, a, N) != 0 ->
+   a == const_str[0] .. || a == const_str[N-1]
+   __builtin_memchr (const_str, a, N) == 0 ->
+   a != const_str[0] .. && a != const_str[N-1]
+   where N is less than the string size.  */
+(for cmp (eq ne)
+ icmp (ne eq)
+ bit_op (bit_and bit_ior)
+ (simplify (cmp:c @0 (BUILT_IN_MEMCHR ADDR_EXPR@1 @2 INTEGER_CST@3))
+  (if (UNITS_PER_WORD <= 8
+   && CHAR_TYPE_SIZE == 8
+   && BITS_PER_UNIT == 8
+   && CHAR_BIT == 8
+   && integer_zerop (@0)
+   && !integer_zerop (@3)
+   && TREE_CODE (TREE_OPERAND (@1, 0)) == STRING_CST
+   && TREE_STRING_LENGTH (TREE_OPERAND (@1, 0)) >= 2
+   && wi::leu_p (wi::to_wide (@3), UNITS_PER_WORD)
+   && wi::ltu_p (wi::to_wide (@3),
+TREE_STRING_LENGTH (TREE_OPERAND (@1, 0
+   (with
+{
+  const char *p = TREE_STRING_POINTER (TREE_OPERAND (@1, 0));
+  unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (@3);
+}
+(switch
+ (if (size == 1)
+  (icmp (convert:char_type_node @2)
+   { build_int_cst (char_type_node, p[0]); }))
+ (if (size == 2)
+  (bit_op
+   (icmp (convert:char_type_node @2)
+{ build_int_cst (char_type_node, p[0]); })
+   (icmp (convert:char_type_node @2)
+{ build_int_cst (char_type_node, p[1]); })))
+ (if (size == 3)
+  (bit_op
+   (icmp (convert:char_type_node @2)
+{ build_int_cst (char_type_node, p[0]); })
+   (bit_op
+(icmp (convert:char_type_node @2)
+ { build_int_cst (char_type_node, p[1]); })
+(icmp (convert:char_type_node @2)
+ { build_int_cst (char_type_node, p[2]); }
+ (if (size == 4)
+  (bit_op
+   (icmp (convert:char_type_node @2)
+{ build_int_cst (char_type_node, p[0]); })
+   (bit_op
+   (icmp (convert:char_type_node @2)
+{ build_int_cst (char_type_node, p[1]); })
+   (bit_op
+(icmp (convert:char_type_node @2)
+  { build_int_cst (char_type_node, p[2]); })
+(icmp (convert:char_type_node @2)
+  { build_int_cst (char_type_node, p[3]); })
+ (if (size == 5)
+  (bit_op
+   (icmp (convert:char_type_node @2)
+{ build_int_cst (char_type_node, p[0]); })
+   (bit_op
+   (icmp (convert:char_type_node @2)
+ { build_int_cst (char_type_node, p[1]); })
+   (bit_op
+(icmp (convert:char_type_node @2)
+  { build_int_cst (char_type_node, p[2]); })
+(bit_op
+ (icmp (convert:char_type_node @2)
+   { build_int_cst (char_type_node, p[3]); })
+ (icmp (convert:char_type_node @2)
+   { build_int_cst (char_type_node, p[4]); }))
+ (if (size == 6)
+  

Re: PING^1 [PATCH] i386: Disallow sibcall when calling ifunc functions with PIC register

2022-06-21 Thread H.J. Lu via Gcc-patches
On Mon, Jun 20, 2022 at 7:51 AM Uros Bizjak  wrote:
>
> On Mon, Jun 20, 2022 at 4:03 PM H.J. Lu  wrote:
> >
> > On Tue, Jun 14, 2022 at 12:25 PM H.J. Lu  wrote:
> > >
> > > > Disallow sibcall when calling ifunc functions with PIC register so that
> > > PIC register can be restored.
> > >
> > > gcc/
> > >
> > > PR target/105960
> > > * config/i386/i386.cc (ix86_function_ok_for_sibcall): Return
> > > false if PIC register is used when calling ifunc functions.
> > >
> > > gcc/testsuite/
> > >
> > > PR target/105960
> > > * gcc.target/i386/pr105960.c: New test.
>
> LGTM.

OK to backport to GCC 12 branch?

Thanks.

> Thanks,
> Uros.
>
> > > ---
> > >  gcc/config/i386/i386.cc  |  9 +
> > >  gcc/testsuite/gcc.target/i386/pr105960.c | 19 +++
> > >  2 files changed, 28 insertions(+)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105960.c
> > >
> > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > index 3d189e124e4..1ca7836e11e 100644
> > > --- a/gcc/config/i386/i386.cc
> > > +++ b/gcc/config/i386/i386.cc
> > > @@ -1015,6 +1015,15 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
> > > }
> > >  }
> > >
> > > +  if (decl && ix86_use_pseudo_pic_reg ())
> > > +{
> > > +  /* When PIC register is used, it must be restored after ifunc
> > > +function returns.  */
> > > +   cgraph_node *node = cgraph_node::get (decl);
> > > +   if (node && node->ifunc_resolver)
> > > +return false;
> > > +}
> > > +
> > >/* Otherwise okay.  That also includes certain types of indirect 
> > > calls.  */
> > >return true;
> > >  }
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr105960.c 
> > > b/gcc/testsuite/gcc.target/i386/pr105960.c
> > > new file mode 100644
> > > index 000..db137a1642d
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr105960.c
> > > @@ -0,0 +1,19 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-require-ifunc "" } */
> > > +/* { dg-options "-O2 -fpic" } */
> > > +
> > > +__attribute__((target_clones("default","fma")))
> > > +static inline double
> > > +expfull_ref(double x)
> > > +{
> > > +  return __builtin_pow(x, 0.1234);
> > > +}
> > > +
> > > +double
> > > +exp_ref(double x)
> > > +{
> > > +  return expfull_ref(x);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler "jmp\[ \t\]*expfull_ref@PLT" { target { ! 
> > > ia32 } } } } */
> > > +/* { dg-final { scan-assembler "call\[ \t\]*expfull_ref@PLT" { target 
> > > ia32 } } } */
> > > --
> > > 2.36.1
> > >
> >
> > PING.
> >
> > --
> > H.J.



-- 
H.J.


PING^1 [PATCH] x86: Skip ENDBR when emitting direct call/jmp to local function

2022-06-20 Thread H.J. Lu via Gcc-patches
On Tue, May 10, 2022 at 9:25 AM H.J. Lu  wrote:
>
> Mark a function with SYMBOL_FLAG_FUNCTION_ENDBR when inserting ENDBR at
> function entry.  Skip the 4-byte ENDBR when emitting a direct call/jmp
> to a local function with ENDBR at function entry.
>
> This has been tested on Linux kernel.
>
> gcc/
>
> PR target/102953
> * config/i386/i386-features.cc
> (rest_of_insert_endbr_and_patchable_area): Set
> SYMBOL_FLAG_FUNCTION_ENDBR when inserting ENDBR.
> * config/i386/i386.cc (ix86_print_operand): Skip the 4-byte ENDBR
> when calling the local function with ENDBR at function entry.
> * config/i386/i386.h (SYMBOL_FLAG_FUNCTION_ENDBR): New.
> (SYMBOL_FLAG_FUNCTION_ENDBR_P): Likewise.
>
> gcc/testsuite/
>
> PR target/102953
> * gcc.target/i386/pr102953-1.c: New test.
> * gcc.target/i386/pr102953-2.c: Likewise.
> ---
>  gcc/config/i386/i386-features.cc   |  2 ++
>  gcc/config/i386/i386.cc| 11 +++-
>  gcc/config/i386/i386.h |  5 
>  gcc/testsuite/gcc.target/i386/pr102953-1.c | 25 ++
>  gcc/testsuite/gcc.target/i386/pr102953-2.c | 30 ++
>  5 files changed, 72 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102953-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102953-2.c
>
> diff --git a/gcc/config/i386/i386-features.cc 
> b/gcc/config/i386/i386-features.cc
> index 6fe41c3c24f..3ca1131ed59 100644
> --- a/gcc/config/i386/i386-features.cc
> +++ b/gcc/config/i386/i386-features.cc
> @@ -1979,6 +1979,8 @@ rest_of_insert_endbr_and_patchable_area (bool 
> need_endbr,
>   || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
>   && DECL_DLLIMPORT_P (cfun->decl
> {
> + rtx symbol = XEXP (DECL_RTL (cfun->decl), 0);
> + SYMBOL_REF_FLAGS (symbol) |= SYMBOL_FLAG_FUNCTION_ENDBR;
>   if (crtl->profile && flag_fentry)
> {
>   /* Queue ENDBR insertion to x86_function_profiler.
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 86752a6516a..ad1de239bef 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -13787,7 +13787,16 @@ ix86_print_operand (FILE *file, rtx x, int code)
>else if (flag_pic || MACHOPIC_INDIRECT)
> output_pic_addr_const (file, x, code);
>else
> -   output_addr_const (file, x);
> +   {
> + /* Skip ENDBR when emitting a direct call/jmp to a local
> +function with ENDBR at function entry.  */
> + if (code == 'P'
> + && GET_CODE (x) == SYMBOL_REF
> + && SYMBOL_REF_LOCAL_P (x)
> + && SYMBOL_FLAG_FUNCTION_ENDBR_P (x))
> +   x = gen_rtx_PLUS (Pmode, x, GEN_INT (4));
> + output_addr_const (file, x);
> +   }
>  }
>  }
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 363082ba47b..7a6317fea57 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -2792,6 +2792,11 @@ extern GTY(()) tree ms_va_list_type_node;
>  #define SYMBOL_REF_STUBVAR_P(X) \
> ((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_STUBVAR) != 0)
>
> +/* Flag to mark a function with ENDBR at entry.  */
> +#define SYMBOL_FLAG_FUNCTION_ENDBR (SYMBOL_FLAG_MACH_DEP << 5)
> +#define SYMBOL_FLAG_FUNCTION_ENDBR_P(X) \
> +   ((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_FUNCTION_ENDBR) != 0)
> +
>  extern void debug_ready_dispatch (void);
>  extern void debug_dispatch_window (int);
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr102953-1.c 
> b/gcc/testsuite/gcc.target/i386/pr102953-1.c
> new file mode 100644
> index 000..2afad391baf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102953-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile { target { ! *-*-darwin* } } } */
> +/* { dg-options "-O2 -fno-pic -fplt -fcf-protection" } */
> +
> +extern int func (int);
> +
> +extern int i;
> +
> +__attribute__ ((noclone, noinline, noipa))
> +static int
> +bar (int x)
> +{
> +  if (x == 0)
> +return x;
> +  return bar (x - 1) + func (x);
> +}
> +
> +void *
> +foo (void)
> +{
> +  i = bar (2);
> +  return bar;
> +}
> +
> +/* { dg-final { scan-assembler-times {call\t_?bar\+4\M} 2 } } */
> +/* { dg-final { scan-assembler-times {call\t_?func\M} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102953-2.c 
> b/gcc/testsuite/gcc.target/i386/pr102953-2.c
> new file mode 100644
> index 000..5b8d517f4f2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102953-2.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile { target { ! *-*-darwin* } } } */
> +/* { dg-options "-O2 -fno-pic -fplt -fcf-protection" } */
> +
> +static int bar (int x);
> +extern int func (int);
> +
> +int
> +foo (int i)
> +{
> +  return bar (i);
> +}
> +
> +void *
> +bar_p (void)
> +{
> +  return bar;
> +}
> +
> +__attribute__ ((noclone, noinline, noipa))
> +static int
> +bar (int x)
> +{
> +  if (x == 0)
> 

Re: [PATCH v1] tree-optimization/95821 - Convert strlen + strchr to memchr

2022-06-20 Thread H.J. Lu via Gcc-patches
On Mon, Jun 20, 2022 at 10:29 AM Jakub Jelinek  wrote:
>
> On Mon, Jun 20, 2022 at 09:35:36AM -0700, Noah Goldstein via Gcc-patches 
> wrote:
> > This patch allows for strchr(x, c) to be replaced with memchr(x, c,
> > strlen(x) + 1) if strlen(x) has already been computed earlier in the
> > tree.
> >
> > Handles PR95821: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95821
> >
> > Since memchr doesn't need to re-find the null terminator it is faster
> > than strchr.
>
> Do you have a GCC Copyright assignment on file, or do you want to submit

Noah works for Intel and he should be covered.

> this under DCO ( https://gcc.gnu.org/dco.html )?  If the latter, there
> should be a Signed-off-by: line, both in the mail and later commit.
> >
> > bootstrapped and tested on x86_64-linux.
> >
> > gcc/
> >
>
> As it fixes a GCC bugzilla bug, the ChangeLog entry should start with
> PR tree-optimization/95821
> line.
> > * tree-ssa-strlen.cc: Emit memchr instead of strchr if strlen
> >  already computed.
>
> All the indented lines in ChangeLog should be indented by tab.
> You are modifying strlen_pass::handle_builtin_strchr function, so after
> tree-ssa-strlen.cc there should be that function name in parens:
> * tree-ssa-strlen.cc (strlen_pass::handle_builtin_strchr): Emit
> memchr ...
>
> >
> > gcc/testsuite/
> >
> > * c-c++-common/pr95821-1.c
> > * c-c++-common/pr95821-2.c
> > * c-c++-common/pr95821-3.c
> > * c-c++-common/pr95821-4.c
> > * c-c++-common/pr95821-5.c
> > * c-c++-common/pr95821-6.c
>
> All the above lines should end with ": New test." after .c
>
> > --- a/gcc/tree-ssa-strlen.cc
> > +++ b/gcc/tree-ssa-strlen.cc
>
> How does the patch relate to the one that H.J. attached in
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95821#c4 ?

Both patches are very similar.  Mine has a bug.

> > @@ -2405,9 +2405,12 @@ strlen_pass::handle_builtin_strlen ()
> >  }
> >  }
> >
> > -/* Handle a strchr call.  If strlen of the first argument is known, replace
> > -   the strchr (x, 0) call with the endptr or x + strlen, otherwise remember
> > -   that lhs of the call is endptr and strlen of the argument is endptr - 
> > x.  */
> > +/* Handle a strchr call.  If strlen of the first argument is known,
> > +   replace the strchr (x, 0) call with the endptr or x + strlen,
> > +   otherwise remember that lhs of the call is endptr and strlen of the
> > +   argument is endptr - x.  If strlen of x is not know but has been
> > +   computed earlier in the tree then replace strchr(x, c) to
> > +   memchr(x, c, strlen + 1).  */
>
> Space before ( even in comments.
>
>
>
> >  void
> >  strlen_pass::handle_builtin_strchr ()
> > @@ -2418,8 +2421,8 @@ strlen_pass::handle_builtin_strchr ()
> >if (lhs == NULL_TREE)
> >  return;
> >
> > -  if (!integer_zerop (gimple_call_arg (stmt, 1)))
> > -return;
> > +  tree chr = gimple_call_arg (stmt, 1);
> > +  bool is_strchr_zerop = integer_zerop (chr);
> >
> >tree src = gimple_call_arg (stmt, 0);
> >
> > @@ -2452,32 +2455,56 @@ strlen_pass::handle_builtin_strchr ()
> > fprintf (dump_file, "Optimizing: ");
> > print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
> >   }
> > -   if (si != NULL && si->endptr != NULL_TREE)
> > +   if (!is_strchr_zerop)
> >   {
> > -   rhs = unshare_expr (si->endptr);
> > -   if (!useless_type_conversion_p (TREE_TYPE (lhs),
> > -   TREE_TYPE (rhs)))
> > - rhs = fold_convert_loc (loc, TREE_TYPE (lhs), rhs);
> > +   /* If its not strchr(s, zerop) then try and convert to
> > +memchr if strlen has already been computed.  */
>
> Again, space before (.  The second line is weirdly formatted, should
> be indented below If.
>
> > +   tree fn = builtin_decl_explicit (BUILT_IN_MEMCHR);
> > +   tree one = build_int_cst (TREE_TYPE (rhs), 1);
> > +   rhs = fold_build2_loc (loc, PLUS_EXPR, TREE_TYPE (rhs),
> > +  unshare_expr (rhs), one);
> > +   tree size = make_ssa_name (TREE_TYPE (rhs));
> > +   gassign *size_stmt = gimple_build_assign (size, rhs);
> > +   gsi_insert_before (_gsi, size_stmt, GSI_SAME_STMT);
> > +   rhs = size;
> > +   if (!update_gimple_call (_gsi, fn, 3, src, chr, rhs))
> > + return;
>
> I think we should differentiate more.  If integer_nonzerop (chr)
> or perhaps better tree_expr_nonzero_p (chr), then it is better
> to optimize t = strlen (x); ... p = strchr (x, c); to
> t = strlen (x); ... p = memchr (x, c, t);
> the t + 1 is only needed if c might be zero.
>
> > +   /* Don't update strlen of lhs if search-char was non-zero.  */
>
> Wasn't known to be zero is the right thing.
>
> Jakub
>


-- 
H.J.


Re: [PATCH] Add -fextra-libc-function=memcmpeq for __memcmpeq

2022-06-20 Thread H.J. Lu via Gcc-patches
On Mon, Jun 20, 2022 at 2:39 AM Richard Biener
 wrote:
>
> On Thu, Jun 16, 2022 at 1:38 AM Fangrui Song  wrote:
> >
> > On Wed, Jun 15, 2022 at 2:44 PM H.J. Lu via Gcc-patches
> >  wrote:
> > >
> > > On Mon, Jun 13, 2022 at 9:01 AM Richard Biener
> > >  wrote:
> > > >
> > > >
> > > >
> > > > > Am 13.06.2022 um 16:36 schrieb H.J. Lu :
> > > > >
> > > > > On Mon, Jun 13, 2022 at 3:11 AM Richard Biener
> > > > >  wrote:
> > > > >>
> > > > >>> On Tue, Jun 7, 2022 at 9:02 PM H.J. Lu via Gcc-patches
> > > > >>>  wrote:
> > > > >>>
> > > > >>> Add -fextra-libc-function=memcmpeq to map
> > > > >>>
> > > > >>> extern int __memcmpeq (const void *, const void *, size_t);
> > > > >>>
> > > > >>> which was added to GLIBC 2.35, to __builtin_memcmp_eq.
> > > > >>
> > > > >> Humm.  Can't we instead use the presence of a declaration
> > > > >> of __memcmpeq with a GNU standard dialect as this instead of
> > > > >> adding a weird -fextra-libc-function= option?  Maybe that's even
> > > > >> reasonable with a non-GNU dialect standard in effect since
> > > > >> __ prefixed names are in the implementation namespace?
> > > > >
> > > > > > But not all source codes include <string.h> and GCC may generate
> > > > > memcmp directly.  How should we handle these cases?
> > > >
> > > > Not.  Similar as to vectorized math functions.
> > > > I think it’s not worth optimizing for this case.
> > >
> > > Another question.  Should we consider any __memcmpeq prototype
> > > or just the one in the system header file?
>
> Any.

Here is the v2 patch:

https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596881.html

> > An idea from https://reviews.llvm.org/D56593#3586673: -fbuiltin-__memcmpeq
> >
> > This requires making -fbuiltin-function available, see
> > https://gcc.gnu.org/onlinedocs/gcc/C-Dialect-Options.html
> > ("There is no corresponding -fbuiltin-function option")
> >
> > I prefer an option over a magic behavior about whether a declaration exists.
>
> But we already have this behavior for multiple cases.  It's also the only
> way that in practice __memcmpeq will be used - _nobody_ (but maybe
> special crafted SPEC peak runs) will add explicit -fbuiltin-__memcmpeq.
>
> Richard.

Thanks.

-- 
H.J.


[PATCH v2] Enable __memcmpeq after seeing __memcmpeq prototype

2022-06-20 Thread H.J. Lu via Gcc-patches
extern int __memcmpeq (const void *, const void *, size_t);

was added to GLIBC 2.35.  Expand BUILT_IN_MEMCMP_EQ to __memcmpeq
after seeing a __memcmpeq prototype.

gcc/

* builtins.cc (have_memcmpeq_prototype): New.
(expand_builtin): Issue an error for BUILT_IN___MEMCMPEQ if
there is no __memcmpeq prototype.  Expand BUILT_IN_MEMCMP_EQ
to BUILT_IN___MEMCMPEQ if there is __memcmpeq prototype.
* builtins.def (BUILT_IN___MEMCMPEQ): New.
* builtins.h (have_memcmpeq_prototype): New.

gcc/c/

* c-decl.cc (diagnose_mismatched_decls): Set
have_memcmpeq_prototype to true after seeing __memcmpeq prototype.

gcc/cp/

*  decl.cc (duplicate_decls): Set have_memcmpeq_prototype to true
after seeing __memcmpeq prototype.

gcc/testsuite/

* c-c++-common/memcmpeq-1.c: New test.
* c-c++-common/memcmpeq-2.c: Likewise.
* c-c++-common/memcmpeq-3.c: Likewise.
* c-c++-common/memcmpeq-4.c: Likewise.
* c-c++-common/memcmpeq-5.c: Likewise.
* c-c++-common/memcmpeq-6.c: Likewise.
* c-c++-common/memcmpeq.h: Likewise.
---
 gcc/builtins.cc | 17 -
 gcc/builtins.def|  3 +++
 gcc/builtins.h  |  3 +++
 gcc/c/c-decl.cc | 25 ++---
 gcc/cp/decl.cc  |  5 +
 gcc/testsuite/c-c++-common/memcmpeq-1.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-2.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-3.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-4.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-5.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-6.c | 10 ++
 gcc/testsuite/c-c++-common/memcmpeq.h   | 11 +++
 12 files changed, 121 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-1.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-2.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-3.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-4.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-5.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-6.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq.h

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 971b18c3745..96e283e5847 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -104,6 +104,9 @@ builtin_info_type builtin_info[(int)END_BUILTINS];
 /* Non-zero if __builtin_constant_p should be folded right away.  */
 bool force_folding_builtin_constant_p;
 
+/* True if there is a __memcmpeq prototype.  */
+bool have_memcmpeq_prototype;
+
 static int target_char_cast (tree, char *);
 static int apply_args_size (void);
 static int apply_result_size (void);
@@ -7392,6 +7395,15 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
machine_mode mode,
return target;
   break;
 
+case BUILT_IN___MEMCMPEQ:
+  if (!have_memcmpeq_prototype)
+   {
+ error ("use of %<__builtin___memcmpeq ()%> without "
+"%<__memcmpeq%> prototype");
+ return const0_rtx;
+   }
+  break;
+
 /* Expand it as BUILT_IN_MEMCMP_EQ first. If not successful, change it
back to a BUILT_IN_STRCMP. Remember to delete the 3rd parameter
when changing it to a strcmp call.  */
@@ -7445,7 +7457,10 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
machine_mode mode,
return target;
   if (fcode == BUILT_IN_MEMCMP_EQ)
{
- tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
+ tree newdecl = builtin_decl_explicit
+   (have_memcmpeq_prototype
+? BUILT_IN___MEMCMPEQ
+: BUILT_IN_MEMCMP);
  TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
}
   break;
diff --git a/gcc/builtins.def b/gcc/builtins.def
index 005976f34e9..95642c6acdf 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -965,6 +965,9 @@ DEF_BUILTIN_STUB (BUILT_IN_ALLOCA_WITH_ALIGN_AND_MAX, 
"__builtin_alloca_with_ali
equality with zero.  */
 DEF_BUILTIN_STUB (BUILT_IN_MEMCMP_EQ, "__builtin_memcmp_eq")
 
+/* Similar to BUILT_IN_MEMCMP_EQ, but is mapped to __memcmpeq.  */
+DEF_EXT_LIB_BUILTIN (BUILT_IN___MEMCMPEQ, "__memcmpeq", 
BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
+
 /* An internal version of strcmp/strncmp, used when the result is only 
tested for equality with zero.  */
 DEF_BUILTIN_STUB (BUILT_IN_STRCMP_EQ, "__builtin_strcmp_eq")
diff --git a/gcc/builtins.h b/gcc/builtins.h
index 5ad830c9fbf..e3e80b33f6d 100644
--- a/gcc/builtins.h
+++ b/gcc/builtins.h
@@ -49,6 +49,9 @@ extern struct target_builtins *this_target_builtins;
 /* Non-zero if __builtin_constant_p should be folded right away.  */
 extern bool force_folding_builtin_constant_p;
 
+/* True if there is a __memcmpeq prototype.  */
+extern bool have_memcmpeq_prototype;
+
 extern bool 

PING^1 [PATCH] i386: Disallow sibcall when calling ifunc functions with PIC register

2022-06-20 Thread H.J. Lu via Gcc-patches
On Tue, Jun 14, 2022 at 12:25 PM H.J. Lu  wrote:
>
> Disallow sibcall when calling ifunc functions with PIC register so that
> PIC register can be restored.
>
> gcc/
>
> PR target/105960
> * config/i386/i386.cc (ix86_function_ok_for_sibcall): Return
> false if PIC register is used when calling ifunc functions.
>
> gcc/testsuite/
>
> PR target/105960
> * gcc.target/i386/pr105960.c: New test.
> ---
>  gcc/config/i386/i386.cc  |  9 +
>  gcc/testsuite/gcc.target/i386/pr105960.c | 19 +++
>  2 files changed, 28 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr105960.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 3d189e124e4..1ca7836e11e 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -1015,6 +1015,15 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
> }
>  }
>
> +  if (decl && ix86_use_pseudo_pic_reg ())
> +{
> +  /* When PIC register is used, it must be restored after ifunc
> +function returns.  */
> +   cgraph_node *node = cgraph_node::get (decl);
> +   if (node && node->ifunc_resolver)
> +return false;
> +}
> +
>/* Otherwise okay.  That also includes certain types of indirect calls.  */
>return true;
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/pr105960.c 
> b/gcc/testsuite/gcc.target/i386/pr105960.c
> new file mode 100644
> index 000..db137a1642d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105960.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-require-ifunc "" } */
> +/* { dg-options "-O2 -fpic" } */
> +
> +__attribute__((target_clones("default","fma")))
> +static inline double
> +expfull_ref(double x)
> +{
> +  return __builtin_pow(x, 0.1234);
> +}
> +
> +double
> +exp_ref(double x)
> +{
> +  return expfull_ref(x);
> +}
> +
> +/* { dg-final { scan-assembler "jmp\[ \t\]*expfull_ref@PLT" { target { ! 
> ia32 } } } } */
> +/* { dg-final { scan-assembler "call\[ \t\]*expfull_ref@PLT" { target ia32 } 
> } } */
> --
> 2.36.1
>

PING.

-- 
H.J.


Re: [PATCH] Add -fextra-libc-function=memcmpeq for __memcmpeq

2022-06-15 Thread H.J. Lu via Gcc-patches
On Mon, Jun 13, 2022 at 9:01 AM Richard Biener
 wrote:
>
>
>
> > Am 13.06.2022 um 16:36 schrieb H.J. Lu :
> >
> > On Mon, Jun 13, 2022 at 3:11 AM Richard Biener
> >  wrote:
> >>
> >>> On Tue, Jun 7, 2022 at 9:02 PM H.J. Lu via Gcc-patches
> >>>  wrote:
> >>>
> >>> Add -fextra-libc-function=memcmpeq to map
> >>>
> >>> extern int __memcmpeq (const void *, const void *, size_t);
> >>>
> >>> which was added to GLIBC 2.35, to __builtin_memcmp_eq.
> >>
> >> Humm.  Can't we instead use the presence of a declaration
> >> of __memcmpeq with a GNU standard dialect as this instead of
> >> adding a weird -fextra-libc-function= option?  Maybe that's even
> >> reasonable with a non-GNU dialect standard in effect since
> >> __ prefixed names are in the implementation namespace?
> >
> > > But not all source codes include <string.h> and GCC may generate
> > memcmp directly.  How should we handle these cases?
>
> Not.  Similar as to vectorized math functions.
> I think it’s not worth optimizing for this case.

Another question.  Should we consider any __memcmpeq prototype
or just the one in the system header file?

> Richard.
>
> >
> >> Richard.
> >>
> >>> gcc/
> >>>
> >>>* builtins.cc: Include "opts.h".
> >>>(expand_builtin): Generate BUILT_IN_MEMCMP_EQ if __memcmpeq is
> >>>available.
> >>>* builtins.def (BUILT_IN___MEMCMPEQ): New.
> >>>* common.opt: Add -fextra-libc-function=.
> >>>* opts.cc (extra_libc_functions): New.
> >>>(parse_extra_libc_function): New function.
> >>>(common_handle_option): Handle -fextra-libc-function=.
> >>>* opts.h (extra_libc_function_list): New.
> >>>(extra_libc_functions): Likewise.
> >>>* doc/invoke.texi: Document -fextra-libc-function=memcmpeq.
> >>>
> >>> gcc/testsuite/
> >>>
> >>>* c-c++-common/memcmpeq-1.c: New test.
> >>>* c-c++-common/memcmpeq-2.c: Likewise.
> >>>* c-c++-common/memcmpeq-3.c: Likewise.
> >>>* c-c++-common/memcmpeq-4.c: Likewise.
> >>>* c-c++-common/memcmpeq-5.c: Likewise.
> >>>* c-c++-common/memcmpeq-6.c: Likewise.
> >>>* c-c++-common/memcmpeq-7.c: Likewise.
> >>> ---
> >>> gcc/builtins.cc |  5 -
> >>> gcc/builtins.def|  4 
> >>> gcc/common.opt  |  4 
> >>> gcc/doc/invoke.texi |  6 ++
> >>> gcc/opts.cc | 23 +++
> >>> gcc/opts.h  |  7 +++
> >>> gcc/testsuite/c-c++-common/memcmpeq-1.c | 11 +++
> >>> gcc/testsuite/c-c++-common/memcmpeq-2.c | 11 +++
> >>> gcc/testsuite/c-c++-common/memcmpeq-3.c | 11 +++
> >>> gcc/testsuite/c-c++-common/memcmpeq-4.c | 11 +++
> >>> gcc/testsuite/c-c++-common/memcmpeq-5.c | 11 +++
> >>> gcc/testsuite/c-c++-common/memcmpeq-6.c | 11 +++
> >>> gcc/testsuite/c-c++-common/memcmpeq-7.c | 11 +++
> >>> 13 files changed, 125 insertions(+), 1 deletion(-)
> >>> create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-1.c
> >>> create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-2.c
> >>> create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-3.c
> >>> create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-4.c
> >>> create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-5.c
> >>> create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-6.c
> >>> create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-7.c
> >>>
> >>> diff --git a/gcc/builtins.cc b/gcc/builtins.cc
> >>> index b9d89b409b8..22269318e8c 100644
> >>> --- a/gcc/builtins.cc
> >>> +++ b/gcc/builtins.cc
> >>> @@ -81,6 +81,7 @@ along with GCC; see the file COPYING3.  If not see
> >>> #include "demangle.h"
> >>> #include "gimple-range.h"
> >>> #include "pointer-query.h"
> >>> +#include "opts.h"
> >>>
> >>> struct target_builtins default_target_builtins;
> >>> #if SWITCHABLE_TARGET
> >>> @@ -7410,7 +7411,9 @@ expand_builtin (tree exp, rtx tar

[PATCH] i386: Disallow sibcall when calling ifunc functions with PIC register

2022-06-14 Thread H.J. Lu via Gcc-patches
Disallow sibcall when calling ifunc functions with PIC register so that
PIC register can be restored.

gcc/

PR target/105960
* config/i386/i386.cc (ix86_function_ok_for_sibcall): Return
false if PIC register is used when calling ifunc functions.

gcc/testsuite/

PR target/105960
* gcc.target/i386/pr105960.c: New test.
---
 gcc/config/i386/i386.cc  |  9 +
 gcc/testsuite/gcc.target/i386/pr105960.c | 19 +++
 2 files changed, 28 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105960.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3d189e124e4..1ca7836e11e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -1015,6 +1015,15 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
}
 }
 
+  if (decl && ix86_use_pseudo_pic_reg ())
+{
+  /* When PIC register is used, it must be restored after ifunc
+function returns.  */
+   cgraph_node *node = cgraph_node::get (decl);
+   if (node && node->ifunc_resolver)
+return false;
+}
+
   /* Otherwise okay.  That also includes certain types of indirect calls.  */
   return true;
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr105960.c 
b/gcc/testsuite/gcc.target/i386/pr105960.c
new file mode 100644
index 000..db137a1642d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105960.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-ifunc "" } */
+/* { dg-options "-O2 -fpic" } */
+
+__attribute__((target_clones("default","fma")))
+static inline double
+expfull_ref(double x)
+{
+  return __builtin_pow(x, 0.1234);
+}
+
+double
+exp_ref(double x)
+{
+  return expfull_ref(x);
+}
+
+/* { dg-final { scan-assembler "jmp\[ \t\]*expfull_ref@PLT" { target { ! ia32 
} } } } */
+/* { dg-final { scan-assembler "call\[ \t\]*expfull_ref@PLT" { target ia32 } } 
} */
-- 
2.36.1



Re: [PATCH] Add -fextra-libc-function=memcmpeq for __memcmpeq

2022-06-13 Thread H.J. Lu via Gcc-patches
On Mon, Jun 13, 2022 at 3:11 AM Richard Biener
 wrote:
>
> On Tue, Jun 7, 2022 at 9:02 PM H.J. Lu via Gcc-patches
>  wrote:
> >
> > Add -fextra-libc-function=memcmpeq to map
> >
> > extern int __memcmpeq (const void *, const void *, size_t);
> >
> > which was added to GLIBC 2.35, to __builtin_memcmp_eq.
>
> Humm.  Can't we instead use the presence of a declaration
> of __memcmpeq with a GNU standard dialect as this instead of
> adding a weird -fextra-libc-function= option?  Maybe that's even
> reasonable with a non-GNU dialect standard in effect since
> __ prefixed names are in the implementation namespace?

But not all source codes include <string.h> and GCC may generate
memcmp directly.  How should we handle these cases?

> Richard.
>
> > gcc/
> >
> > * builtins.cc: Include "opts.h".
> > (expand_builtin): Generate BUILT_IN_MEMCMP_EQ if __memcmpeq is
> > available.
> > * builtins.def (BUILT_IN___MEMCMPEQ): New.
> > * common.opt: Add -fextra-libc-function=.
> > * opts.cc (extra_libc_functions): New.
> > (parse_extra_libc_function): New function.
> > (common_handle_option): Handle -fextra-libc-function=.
> > * opts.h (extra_libc_function_list): New.
> > (extra_libc_functions): Likewise.
> > * doc/invoke.texi: Document -fextra-libc-function=memcmpeq.
> >
> > gcc/testsuite/
> >
> > * c-c++-common/memcmpeq-1.c: New test.
> > * c-c++-common/memcmpeq-2.c: Likewise.
> > * c-c++-common/memcmpeq-3.c: Likewise.
> > * c-c++-common/memcmpeq-4.c: Likewise.
> > * c-c++-common/memcmpeq-5.c: Likewise.
> > * c-c++-common/memcmpeq-6.c: Likewise.
> > * c-c++-common/memcmpeq-7.c: Likewise.
> > ---
> >  gcc/builtins.cc |  5 -
> >  gcc/builtins.def|  4 
> >  gcc/common.opt  |  4 
> >  gcc/doc/invoke.texi |  6 ++
> >  gcc/opts.cc | 23 +++
> >  gcc/opts.h  |  7 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-1.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-2.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-3.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-4.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-5.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-6.c | 11 +++
> >  gcc/testsuite/c-c++-common/memcmpeq-7.c | 11 +++
> >  13 files changed, 125 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-1.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-2.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-3.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-4.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-5.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-6.c
> >  create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-7.c
> >
> > diff --git a/gcc/builtins.cc b/gcc/builtins.cc
> > index b9d89b409b8..22269318e8c 100644
> > --- a/gcc/builtins.cc
> > +++ b/gcc/builtins.cc
> > @@ -81,6 +81,7 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "demangle.h"
> >  #include "gimple-range.h"
> >  #include "pointer-query.h"
> > +#include "opts.h"
> >
> >  struct target_builtins default_target_builtins;
> >  #if SWITCHABLE_TARGET
> > @@ -7410,7 +7411,9 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
> > machine_mode mode,
> > return target;
> >if (fcode == BUILT_IN_MEMCMP_EQ)
> > {
> > - tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
> > + tree newdecl = builtin_decl_explicit
> > +   (extra_libc_functions.has_memcmpeq
> > +? BUILT_IN___MEMCMPEQ : BUILT_IN_MEMCMP);
> >   TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
> > }
> >break;
> > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > index 005976f34e9..eb8d33b16e9 100644
> > --- a/gcc/builtins.def
> > +++ b/gcc/builtins.def
> > @@ -965,6 +965,10 @@ DEF_BUILTIN_STUB (BUILT_IN_ALLOCA_WITH_ALIGN_AND_MAX, 
> > "__builtin_alloca_with_ali
> > equality with zero.  */
> >  DEF_BUILTIN_STUB (BUILT_IN_MEMCMP_EQ, "__builtin_memcmp_eq")
> >
> > +/* Similar to BUILT_IN_MEMCMP_EQ, but is mapp

[PATCH] x86: Require AVX for F16C and VAES

2022-06-10 Thread H.J. Lu via Gcc-patches
Since F16C and VAES are only usable with AVX, require AVX for F16C and
VAES.

OK for master and release branches?

Thanks.

H.J.
---
libgcc/105920
* common/config/i386/cpuinfo.h (get_available_features): Require
AVX for F16C and VAES.
---
 gcc/common/config/i386/cpuinfo.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index adc02bc3d98..bbced8a23b9 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -651,8 +651,6 @@ get_available_features (struct __processor_model *cpu_model,
 set_feature (FEATURE_MOVBE);
   if (ecx & bit_AES)
 set_feature (FEATURE_AES);
-  if (ecx & bit_F16C)
-set_feature (FEATURE_F16C);
   if (ecx & bit_RDRND)
 set_feature (FEATURE_RDRND);
   if (ecx & bit_XSAVE)
@@ -663,6 +661,8 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVX);
   if (ecx & bit_FMA)
set_feature (FEATURE_FMA);
+  if (ecx & bit_F16C)
+   set_feature (FEATURE_F16C);
 }
 
   /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */
@@ -683,6 +683,8 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVX2);
  if (ecx & bit_VPCLMULQDQ)
set_feature (FEATURE_VPCLMULQDQ);
+ if (ecx & bit_VAES)
+   set_feature (FEATURE_VAES);
}
   if (ebx & bit_BMI2)
set_feature (FEATURE_BMI2);
@@ -705,8 +707,6 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_PKU);
   if (ecx & bit_RDPID)
set_feature (FEATURE_RDPID);
-  if (ecx & bit_VAES)
-   set_feature (FEATURE_VAES);
   if (ecx & bit_GFNI)
set_feature (FEATURE_GFNI);
   if (ecx & bit_MOVDIRI)
-- 
2.36.1



Re: [PATCH] Add optional __Bfloat16 support

2022-06-10 Thread H.J. Lu via Gcc-patches
On Fri, Jun 10, 2022 at 7:44 AM H.J. Lu  wrote:
>
> On Fri, Jun 10, 2022 at 2:38 AM Florian Weimer  wrote:
> >
> > * liuhongt via Libc-alpha:
> >
> > > +\subsubsection{Special Types}
> > > +
> > > +The \code{__Bfloat16} type uses a 8-bit exponent and 7-bit mantissa.
> > > +It is used for \code{BF16} related intrinsics, it cannot be
>
> Please mention that this is an alternate encoding format for 16-bit floating
> point.  It has the same size and alignment as _Float16.

It also follows the same rules as _Float16 for parameter passing and function
return.

> > > +used with standard C operators.
> >
> > I think it's not necessary to specify whether the type supports certain
> > C operators (surely assignment will work?).  If they are added later,
> > the ABI won't need changing.
> >
>
> If _Bfloat16 becomes a fundamental type, the ABI should be changed to
> move it together with other scalar types.
>
> --
> H.J.



-- 
H.J.


Re: [PATCH] Add optional __Bfloat16 support

2022-06-10 Thread H.J. Lu via Gcc-patches
On Fri, Jun 10, 2022 at 2:38 AM Florian Weimer  wrote:
>
> * liuhongt via Libc-alpha:
>
> > +\subsubsection{Special Types}
> > +
> > +The \code{__Bfloat16} type uses a 8-bit exponent and 7-bit mantissa.
> > +It is used for \code{BF16} related intrinsics, it cannot be

Please mention that this is an alternate encoding format for 16-bit floating
point.  It has the same size and alignment as _Float16.

> > +used with standard C operators.
>
> I think it's not necessary to specify whether the type supports certain
> C operators (surely assignment will work?).  If they are added later,
> the ABI won't need changing.
>

If _Bfloat16 becomes a fundamental type, the ABI should be changed to
move it together with other scalar types.

-- 
H.J.


[PATCH] Add -fextra-libc-function=memcmpeq for __memcmpeq

2022-06-07 Thread H.J. Lu via Gcc-patches
Add -fextra-libc-function=memcmpeq to map

extern int __memcmpeq (const void *, const void *, size_t);

which was added to GLIBC 2.35, to __builtin_memcmp_eq.

gcc/

* builtins.cc: Include "opts.h".
(expand_builtin): Expand BUILT_IN_MEMCMP_EQ to BUILT_IN___MEMCMPEQ
if __memcmpeq is available.
* builtins.def (BUILT_IN___MEMCMPEQ): New.
* common.opt: Add -fextra-libc-function=.
* opts.cc (extra_libc_functions): New.
(parse_extra_libc_function): New function.
(common_handle_option): Handle -fextra-libc-function=.
* opts.h (extra_libc_function_list): New.
(extra_libc_functions): Likewise.
* doc/invoke.texi: Document -fextra-libc-function=memcmpeq.

gcc/testsuite/

* c-c++-common/memcmpeq-1.c: New test.
* c-c++-common/memcmpeq-2.c: Likewise.
* c-c++-common/memcmpeq-3.c: Likewise.
* c-c++-common/memcmpeq-4.c: Likewise.
* c-c++-common/memcmpeq-5.c: Likewise.
* c-c++-common/memcmpeq-6.c: Likewise.
* c-c++-common/memcmpeq-7.c: Likewise.
---
 gcc/builtins.cc |  5 -
 gcc/builtins.def|  4 
 gcc/common.opt  |  4 
 gcc/doc/invoke.texi |  6 ++
 gcc/opts.cc | 23 +++
 gcc/opts.h  |  7 +++
 gcc/testsuite/c-c++-common/memcmpeq-1.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-2.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-3.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-4.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-5.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-6.c | 11 +++
 gcc/testsuite/c-c++-common/memcmpeq-7.c | 11 +++
 13 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-1.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-2.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-3.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-4.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-5.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-6.c
 create mode 100644 gcc/testsuite/c-c++-common/memcmpeq-7.c

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index b9d89b409b8..22269318e8c 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -81,6 +81,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "demangle.h"
 #include "gimple-range.h"
 #include "pointer-query.h"
+#include "opts.h"
 
 struct target_builtins default_target_builtins;
 #if SWITCHABLE_TARGET
@@ -7410,7 +7411,9 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
machine_mode mode,
return target;
   if (fcode == BUILT_IN_MEMCMP_EQ)
{
- tree newdecl = builtin_decl_explicit (BUILT_IN_MEMCMP);
+ tree newdecl = builtin_decl_explicit
+   (extra_libc_functions.has_memcmpeq
+? BUILT_IN___MEMCMPEQ : BUILT_IN_MEMCMP);
  TREE_OPERAND (exp, 1) = build_fold_addr_expr (newdecl);
}
   break;
diff --git a/gcc/builtins.def b/gcc/builtins.def
index 005976f34e9..eb8d33b16e9 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -965,6 +965,10 @@ DEF_BUILTIN_STUB (BUILT_IN_ALLOCA_WITH_ALIGN_AND_MAX, 
"__builtin_alloca_with_ali
equality with zero.  */
 DEF_BUILTIN_STUB (BUILT_IN_MEMCMP_EQ, "__builtin_memcmp_eq")
 
+/* Similar to BUILT_IN_MEMCMP_EQ, but is mapped to __memcmpeq only with
+   -fextra-libc-function=memcmpeq.  */
+DEF_EXT_LIB_BUILTIN (BUILT_IN___MEMCMPEQ, "__memcmpeq", 
BT_FN_INT_CONST_PTR_CONST_PTR_SIZE, ATTR_PURE_NOTHROW_NONNULL_LEAF)
+
 /* An internal version of strcmp/strncmp, used when the result is only 
tested for equality with zero.  */
 DEF_BUILTIN_STUB (BUILT_IN_STRCMP_EQ, "__builtin_strcmp_eq")
diff --git a/gcc/common.opt b/gcc/common.opt
index 7ca0cceed82..7a7631682b0 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1587,6 +1587,10 @@ Enum(excess_precision) String(standard) 
Value(EXCESS_PRECISION_STANDARD)
 EnumValue
 Enum(excess_precision) String(16) Value(EXCESS_PRECISION_FLOAT16)
 
+fextra-libc-function=
+Common Driver Joined
+Specify the extra function in the C library.
+
 ; Whether we permit the extended set of values for FLT_EVAL_METHOD
 ; introduced in ISO/IEC TS 18661-3, or limit ourselves to those in C99/C11.
 fpermitted-flt-eval-methods=
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 8cd5bdddc5d..fe1e3709953 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -676,6 +676,7 @@ Objective-C and Objective-C++ Dialects}.
 -ffixed-@var{reg}  -fexceptions @gol
 -fnon-call-exceptions  -fdelete-dead-exceptions  -funwind-tables @gol
 -fasynchronous-unwind-tables @gol
+-fextra-libc-function=memcmpeq @gol
 -fno-gnu-unique @gol
 -finhibit-size-directive  -fcommon  -fno-ident @gol
 -fpcc-struct-return  -fpic  -fPIC  -fpie  -fPIE  -fno-plt @gol
@@ -17250,6 +17251,11 @@ Generate 

Re: [PATCH] x86: harmonize __builtin_ia32_psadbw*() types

2022-06-06 Thread H.J. Lu via Gcc-patches
On Sun, Jun 5, 2022 at 7:27 PM Hongtao Liu via Gcc-patches
 wrote:
>
> On Mon, Jun 6, 2022 at 3:17 AM Uros Bizjak via Gcc-patches
>  wrote:
> >
> > On Thu, Jun 2, 2022 at 5:04 PM Jan Beulich  wrote:
> > >
> > > The 64-bit, 128-bit, and 512-bit variants have VDI return type, in
> > > line with instruction behavior. Make the 256-bit builtin match, thus
> > > also making it match the insn it expands to (using VI8_AVX2_AVX512BW).
> > >
> > > gcc/
> > >
> > > * config/i386/i386-builtin.def (__builtin_ia32_psadbw256):
> > > Change type.
> > > * config/i386/i386-builtin-types.def: New function type
> > > (V4DI, V32QI, V32QI).
> > > * config/i386/i386-expand.cc (ix86_expand_args_builtin): Handle
> > > V4DI_FTYPE_V32QI_V32QI.
> >
> > LGTM, but please let HJ have the final approval.
> I think it was just a typo and not intentional, so Ok for the trunk.

LGTM too.

Thanks.

> >
> > Uros.
> >
> > >
> > > --- a/gcc/config/i386/i386-builtin.def
> > > +++ b/gcc/config/i386/i386-builtin.def
> > > @@ -1217,7 +1217,7 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR
> > >  BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_mulv8si3, 
> > > "__builtin_ia32_pmulld256"  , IX86_BUILTIN_PMULLD256  , UNKNOWN, (int) 
> > > V8SI_FTYPE_V8SI_V8SI)
> > >  BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_vec_widen_umult_even_v8si, 
> > > "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) 
> > > V4DI_FTYPE_V8SI_V8SI)
> > >  BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_iorv4di3, 
> > > "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) 
> > > V4DI_FTYPE_V4DI_V4DI)
> > > -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_psadbw, 
> > > "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) 
> > > V16HI_FTYPE_V32QI_V32QI)
> > > +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_psadbw, 
> > > "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) 
> > > V4DI_FTYPE_V32QI_V32QI)
> > >  BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pshufbv32qi3, 
> > > "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) 
> > > V32QI_FTYPE_V32QI_V32QI)
> > >  BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pshufdv3, 
> > > "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) 
> > > V8SI_FTYPE_V8SI_INT)
> > >  BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pshufhwv3, 
> > > "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) 
> > > V16HI_FTYPE_V16HI_INT)
> > > --- a/gcc/config/i386/i386-builtin-types.def
> > > +++ b/gcc/config/i386/i386-builtin-types.def
> > > @@ -516,6 +516,7 @@ DEF_FUNCTION_TYPE (V8DI, V8DI, V2DI, INT
> > >  DEF_FUNCTION_TYPE (V8DI, V8DI, V2DI, INT, V8DI, UQI)
> > >  DEF_FUNCTION_TYPE (V8DI, V8DI, V4DI, INT, V8DI, UQI)
> > >  DEF_FUNCTION_TYPE (V4DI, V8SI, V8SI)
> > > +DEF_FUNCTION_TYPE (V4DI, V32QI, V32QI)
> > >  DEF_FUNCTION_TYPE (V8DI, V64QI, V64QI)
> > >  DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI)
> > >  DEF_FUNCTION_TYPE (V4DI, PCV4DI, V4DI)
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -10359,6 +10359,7 @@ ix86_expand_args_builtin (const struct b
> > >  case V8SI_FTYPE_V16HI_V16HI:
> > >  case V4DI_FTYPE_V4DI_V4DI:
> > >  case V4DI_FTYPE_V8SI_V8SI:
> > > +case V4DI_FTYPE_V32QI_V32QI:
> > >  case V8DI_FTYPE_V64QI_V64QI:
> > >if (comparison == UNKNOWN)
> > > return ix86_expand_binop_builtin (icode, exp, target);
> > >
>
>
>
> --
> BR,
> Hongtao



-- 
H.J.


Re: [PATCH v3] RISC-V: Implement C[LT]Z_DEFINED_VALUE_AT_ZERO

2022-06-03 Thread H.J. Lu via Gcc-patches
On Fri, May 13, 2022 at 1:17 PM Philipp Tomsich
 wrote:
>
> The Zbb support has introduced ctz and clz to the backend, but some
> transformations in GCC need to know what the value of c[lt]z at zero
> is. This affects how the optab is generated and may suppress use of
> CLZ/CTZ in tree passes.
>
> Among other things, this is needed for the transformation of
> table-based ctz-implementations, such as in deepsjeng, to work
> (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90838).
>
> Prior to this change, the test case from PR90838 would compile to the
> following on RISC-V targets with Zbb:
>   myctz:
> lui a4,%hi(.LC0)
> ld  a4,%lo(.LC0)(a4)
> neg a5,a0
> and a5,a5,a0
> mul a5,a5,a4
> lui a4,%hi(.LANCHOR0)
> addia4,a4,%lo(.LANCHOR0)
> srlia5,a5,58
> sh2add  a5,a5,a4
> lw  a0,0(a5)
> ret
>
> After this change, we get:
>   myctz:
> ctz a0,a0
> andia0,a0,63
> ret
>
> Testing this with deepsjeng_r (from SPEC 2017) against QEMU, this
> shows a clear reduction in dynamic instruction count:
>  - before  1961888067076
>  - after   1907928279874 (2.75% reduction)
>
> This also merges the various target-specific test-cases (for x86-64,
> aarch64 and riscv) within gcc.dg/pr90838.c.
>
> This extends the macros (i.e., effective-target keywords) used in
> testing (lib/target-supports.exp) to reliably distinguish between RV32
> and RV64 via __riscv_xlen (i.e., the integer register bitwidth):
> testing for ILP32 could be misleading (as ILP32 is a valid memory
> model for 64-bit systems).
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.h (CLZ_DEFINED_VALUE_AT_ZERO): Implement.
> (CTZ_DEFINED_VALUE_AT_ZERO): Same.
> * doc/sourcebuild.texi: add documentation for RISC-V specific
> test target keywords
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/pr90838.c: Add additional flags (dg-additional-options)
>   when compiling for riscv64 and subsume gcc.target/aarch64/pr90838.c
>   and gcc.target/i386/pr95863-2.c.
> * gcc.target/riscv/zbb-ctz.c: New test.
> * gcc.target/aarch64/pr90838.c: Removed.
> * gcc.target/i386/pr95863-2.c: Removed.
> * lib/target-supports.exp: Recognize RV32 or RV64 via XLEN
>
> Signed-off-by: Philipp Tomsich 
> Signed-off-by: Manolis Tsamis 
> Co-developed-by: Manolis Tsamis 
>
> ---
> Changes in v3:
> - Address nit from Kito (use rv64 and rv32 on gcc.dg/pr90838.c
>   consistently).
>
> Changes in v2:
> - Address review comments from Palmer (merging testcases)
> - Merge the different target-specific testcases for CLZ into one
> - Add RV32 tests
> - Fix pr90838.c testcase for x86_64
>
>  gcc/config/riscv/riscv.h   |  5 ++
>  gcc/doc/sourcebuild.texi   | 12 
>  gcc/testsuite/gcc.dg/pr90838.c | 25 +
>  gcc/testsuite/gcc.target/aarch64/pr90838.c | 64 --
>  gcc/testsuite/gcc.target/i386/pr95863-2.c  | 27 -
>  gcc/testsuite/lib/target-supports.exp  | 30 ++
>  6 files changed, 72 insertions(+), 91 deletions(-)
>  delete mode 100644 gcc/testsuite/gcc.target/aarch64/pr90838.c
>  delete mode 100644 gcc/testsuite/gcc.target/i386/pr95863-2.c
>
> diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
> index 8a4d2cf7f85..b191606edb4 100644
> --- a/gcc/config/riscv/riscv.h
> +++ b/gcc/config/riscv/riscv.h
> @@ -1004,4 +1004,9 @@ extern void riscv_remove_unneeded_save_restore_calls 
> (void);
>
>  #define HARD_REGNO_RENAME_OK(FROM, TO) riscv_hard_regno_rename_ok (FROM, TO)
>
> +#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
> +  ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE), 2)
> +#define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
> +  ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE), 2)
> +
>  #endif /* ! GCC_RISCV_H */
> diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
> index 613ac29967b..71c04841df2 100644
> --- a/gcc/doc/sourcebuild.texi
> +++ b/gcc/doc/sourcebuild.texi
> @@ -2420,6 +2420,18 @@ PowerPC target pre-defines macro _ARCH_PWR9 which 
> means the @code{-mcpu}
>  setting is Power9 or later.
>  @end table
>
> +@subsection RISC-V specific attributes
> +
> +@table @code
> +
> +@item rv32
> +Test system has an integer register width of 32 bits.
> +
> +@item rv64
> +Test system has an integer register width of 64 bits.
> +
> +@end table
> +
>  @subsubsection Other hardware attributes
>
>  @c Please keep this table sorted alphabetically.
> diff --git a/gcc/testsuite/gcc.dg/pr90838.c b/gcc/testsuite/gcc.dg/pr90838.c
> index 41c5dab9a5c..7502b846346 100644
> --- a/gcc/testsuite/gcc.dg/pr90838.c
> +++ b/gcc/testsuite/gcc.dg/pr90838.c
> @@ -1,5 +1,8 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -fdump-tree-forwprop2-details" } */
> +/* { dg-additional-options "-mbmi" { target { { i?86-*-* x86_64-*-* } && { ! 
> { ia32 } } } } } */
> +/* { dg-additional-options "-march=rv64gc_zbb" { 

Re: [PATCH v4] DSE: Use the constant store source if possible

2022-06-01 Thread H.J. Lu via Gcc-patches
On Wed, Jun 1, 2022 at 12:20 AM Richard Sandiford
 wrote:
>
> "H.J. Lu"  writes:
> > On Mon, May 30, 2022 at 09:35:43AM +0100, Richard Sandiford wrote:
> >> "H.J. Lu"  writes:
> >> > ---
> >> > RTL DSE tracks redundant constant stores within a basic block.  When RTL
> >> > loop invariant motion hoists a constant initialization out of the loop
> >> > into a separate basic block, the constant store value becomes unknown
> >> > within the original basic block.  When recording a store for RTL DSE, check
> >> > if the source register is set only once to a constant by a non-partial
> >> > unconditional load.  If yes, record the constant as the constant store
> >> > source.  It eliminates unrolled zero stores after memset 0 in a loop
> >> > where a vector register is used as the zero store source.
> >> >
> >> > Extract find_single_def_src from loop-iv.cc and move it to df-core.cc:
> >> >
> >> > 1. Rename to df_find_single_def_src.
> >> > 2. Change the argument to rtx and use rtx_equal_p.
> >> > 3. Return null for partial or conditional defs.
> >> >
> >> > gcc/
> >> >
> >> >PR rtl-optimization/105638
> >> >* df-core.cc (df_find_single_def_src): Moved and renamed from
> >> >find_single_def_src in loop-iv.cc.  Change the argument to rtx
> >> >and use rtx_equal_p.  Return null for partial or conditional
> >> >defs.
> >> >* df.h (df_find_single_def_src): New prototype.
> >> >* dse.cc (record_store): Use the constant source if the source
> >> >register is set only once.
> >> >* loop-iv.cc (find_single_def_src): Moved to df-core.cc.
> >> >(replace_single_def_regs): Replace find_single_def_src with
> >> >df_find_single_def_src.
> >> >
> >> > gcc/testsuite/
> >> >
> >> >PR rtl-optimization/105638
> >> >* g++.target/i386/pr105638.C: New test.
> >> > ---
> >> >  gcc/df-core.cc   | 44 +++
> >> >  gcc/df.h |  1 +
> >> >  gcc/dse.cc   | 14 
> >> >  gcc/loop-iv.cc   | 45 +---
> >> >  gcc/testsuite/g++.target/i386/pr105638.C | 44 +++
> >> >  5 files changed, 104 insertions(+), 44 deletions(-)
> >> >  create mode 100644 gcc/testsuite/g++.target/i386/pr105638.C
> >> >
> >> > diff --git a/gcc/df-core.cc b/gcc/df-core.cc
> >> > index a901b84878f..f9b4de8eb7a 100644
> >> > --- a/gcc/df-core.cc
> >> > +++ b/gcc/df-core.cc
> >> > @@ -2009,6 +2009,50 @@ df_reg_used (rtx_insn *insn, rtx reg)
> >> >return df_find_use (insn, reg) != NULL;
> >> >  }
> >> >
> >> > +/* If REG has a single definition, return its known value, otherwise 
> >> > return
> >> > +   null.  */
> >> > +
> >> > +rtx
> >> > +df_find_single_def_src (rtx reg)
> >> > +{
> >> > +  rtx src = NULL_RTX;
> >> > +
> >> > +  /* Don't look through unbounded number of single definition REG 
> >> > copies,
> >> > + there might be loops for sources with uninitialized variables.  */
> >> > +  for (int cnt = 0; cnt < 128; cnt++)
> >> > +{
> >> > +  df_ref adef = DF_REG_DEF_CHAIN (REGNO (reg));
> >> > +  if (adef == NULL || DF_REF_NEXT_REG (adef) != NULL
> >> > +|| DF_REF_IS_ARTIFICIAL (adef)
> >> > +|| (DF_REF_FLAGS (adef)
> >> > +& (DF_REF_PARTIAL | DF_REF_CONDITIONAL)))
> >> > +  return NULL_RTX;
> >> > +
> >> > +  rtx set = single_set (DF_REF_INSN (adef));
> >> > +  if (set == NULL || !rtx_equal_p (SET_DEST (set), reg))
> >> > +  return NULL_RTX;
> >> > +
> >> > +  rtx note = find_reg_equal_equiv_note (DF_REF_INSN (adef));
> >> > +  if (note && function_invariant_p (XEXP (note, 0)))
> >> > +  {
> >> > +src = XEXP (note, 0);
> >> > +break;
> >> > +  }
> >>
> >> Seems simpler to return this directly, rather than break and then
> >> check function_invariant_p again.
> >
> > Fixed.
> >
> >>
> >> > +  src = SET_SRC (set);
> >> > +
> >> > +  if (REG_P (src))
> >> > +  {
> >> > +reg = src;
> >> > +continue;
> >> > +  }
> >> > +  break;
> >> > +}
> >> > +  if (!function_invariant_p (src))
> >> > +return NULL_RTX;
> >> > +
> >> > +  return src;
> >> > +}
> >> > +
> >> >
> >> >  
> >> > /*
> >> > Debugging and printing functions.
> >> > diff --git a/gcc/df.h b/gcc/df.h
> >> > index bd329205d08..71e249ad20a 100644
> >> > --- a/gcc/df.h
> >> > +++ b/gcc/df.h
> >> > @@ -991,6 +991,7 @@ extern df_ref df_find_def (rtx_insn *, rtx);
> >> >  extern bool df_reg_defined (rtx_insn *, rtx);
> >> >  extern df_ref df_find_use (rtx_insn *, rtx);
> >> >  extern bool df_reg_used (rtx_insn *, rtx);
> >> > +extern rtx df_find_single_def_src (rtx);
> >> >  extern void df_worklist_dataflow (struct dataflow *,bitmap, int *, int);
> >> >  extern void df_print_regset (FILE *file, const_bitmap r);
> >> >  extern void df_print_word_regset (FILE *file, const_bitmap r);
> >> > diff --git a/gcc/dse.cc b/gcc/dse.cc
> >> > index 

Re: [PATCH] Update {skylake,icelake,alderlake}_cost to add a bit preference to vector store.

2022-06-01 Thread H.J. Lu via Gcc-patches
On Tue, May 31, 2022 at 10:06 PM Cui,Lili  wrote:
>
> This patch updates {skylake,icelake,alderlake}_cost to add a slight
> preference for vector stores.
> Since the integer vector construction cost has changed, we need to adjust
> the load and store costs for Intel processors.
>
> With the patch applied
> 538.imagick_r: gets ~6% improvement on ADL for multicopy.
> 525.x264_r  :gets ~2% improvement on ADL and ICX for multicopy.
> with no measurable changes for other benchmarks.
>
> Bootstrapped and regtested on x86_64-linux-gnu{-m32,}. Ok for trunk?
>
> Thanks,
> Lili.
>
> gcc/ChangeLog
>
> PR target/105493
> * config/i386/x86-tune-costs.h (skylake_cost): Raise the gpr load cost
> from 4 to 6 and gpr store cost from 6 to 8. Change SSE loads and
> unaligned loads cost from {6, 6, 6, 10, 20} to {8, 8, 8, 8, 16}.
> (icelake_cost): Ditto.
> (alderlake_cost): Raise the gpr store cost from 6 to 8 and SSE loads,
> stores and unaligned stores cost from {6, 6, 6, 10, 15} to
> {8, 8, 8, 10, 15}.
>
> gcc/testsuite/
>
> PR target/105493
> * gcc.target/i386/pr91446.c: Adjust to expect vectorization.
> * gcc.target/i386/pr99881.c: XFAIL.
> ---
>  gcc/config/i386/x86-tune-costs.h| 26 -
>  gcc/testsuite/gcc.target/i386/pr91446.c |  2 +-
>  gcc/testsuite/gcc.target/i386/pr99881.c |  2 +-
>  3 files changed, 15 insertions(+), 15 deletions(-)
>
> diff --git a/gcc/config/i386/x86-tune-costs.h 
> b/gcc/config/i386/x86-tune-costs.h
> index ea34a939c68..6c9066c84cc 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -1897,15 +1897,15 @@ struct processor_costs skylake_cost = {
>8,   /* "large" insn */
>17,  /* MOVE_RATIO */
>17,  /* CLEAR_RATIO */
> -  {4, 4, 4},   /* cost of loading integer registers
> +  {6, 6, 6},   /* cost of loading integer registers
>in QImode, HImode and SImode.
>Relative to reg-reg move (2).  */
> -  {6, 6, 6},   /* cost of storing integer registers 
> */
> -  {6, 6, 6, 10, 20},   /* cost of loading SSE register
> +  {8, 8, 8},   /* cost of storing integer registers 
> */
> +  {8, 8, 8, 8, 16},/* cost of loading SSE register
>in 32bit, 64bit, 128bit, 256bit 
> and 512bit */
>{8, 8, 8, 8, 16},/* cost of storing SSE register
>in 32bit, 64bit, 128bit, 256bit 
> and 512bit */
> -  {6, 6, 6, 10, 20},   /* cost of unaligned loads.  */
> +  {8, 8, 8, 8, 16},/* cost of unaligned loads.  */
>{8, 8, 8, 8, 16},/* cost of unaligned stores.  */
>2, 2, 4, /* cost of moving XMM,YMM,ZMM 
> register */
>6,   /* cost of moving SSE register to 
> integer.  */
> @@ -2023,15 +2023,15 @@ struct processor_costs icelake_cost = {
>8,   /* "large" insn */
>17,  /* MOVE_RATIO */
>17,  /* CLEAR_RATIO */
> -  {4, 4, 4},   /* cost of loading integer registers
> +  {6, 6, 6},   /* cost of loading integer registers
>in QImode, HImode and SImode.
>Relative to reg-reg move (2).  */
> -  {6, 6, 6},   /* cost of storing integer registers 
> */
> -  {6, 6, 6, 10, 20},   /* cost of loading SSE register
> +  {8, 8, 8},   /* cost of storing integer registers 
> */
> +  {8, 8, 8, 8, 16},/* cost of loading SSE register
>in 32bit, 64bit, 128bit, 256bit 
> and 512bit */
>{8, 8, 8, 8, 16},/* cost of storing SSE register
>in 32bit, 64bit, 128bit, 256bit 
> and 512bit */
> -  {6, 6, 6, 10, 20},   /* cost of unaligned loads.  */
> +  {8, 8, 8, 8, 16},/* cost of unaligned loads.  */
>{8, 8, 8, 8, 16},/* cost of unaligned stores.  */
>2, 2, 4, /* cost of moving XMM,YMM,ZMM 
> register */
>6,   /* cost of moving SSE register to 
> integer.  */
> @@ -2146,13 +2146,13 @@ struct processor_costs alderlake_cost = {
>{6, 6, 6},   /* cost of loading integer registers
>in QImode, HImode and SImode.
>

  1   2   3   4   5   6   7   8   9   10   >