Hi all,

This patch adds basic support for SVE stack clash protection.
It is a first implementation and uses a loop to do the
probing and stack adjustments.

An example sequence is:

        .cfi_startproc
        mov     x15, sp
        cntb    x16, all, mul #11
        add     x16, x16, 304
        .cfi_def_cfa_register 15
.SVLPSRL0:
        cmp     x16, 4096
        b.lt    .BRRCHK0
        sub     sp, sp, 4096
        str     xzr, [sp, 1024]
        sub     x16, x16, 4096
        b       .SVLPSRL0
.BRRCHK0:
        sub     sp, sp, x16
        cmp     sp, 1024
        b.lt    .BERCHK0
        str     xzr, [sp, 1024]
.BERCHK0:
        .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22
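
For reference, my decoding of the .cfi_escape above (please double-check) is a
DW_CFA_def_cfa_expression whose 12-byte expression makes the CFA
sp + VG * 88 + 304 once the full adjustment has been made:

        0x0f              DW_CFA_def_cfa_expression
        0x0c              expression block is 12 bytes long
        0x8f 0x00         DW_OP_breg31 (sp) + 0
        0x92 0x2e 0x00    DW_OP_bregx, reg 46 (VG) + 0
        0x08 0x58         DW_OP_const1u 88
        0x1e              DW_OP_mul               -> VG * 88
        0x23 0xb0 0x02    DW_OP_plus_uconst 304   -> VG * 88 + 304
        0x22              DW_OP_plus              -> sp + VG * 88 + 304

which matches the scalable part from "cntb x16, all, mul #11" (11 * VL bytes
== 88 * VG) plus the 304 byte constant part of the frame.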

This has about the same semantics as alloca.
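
To show what triggers the new code path: the new stack-check-prologue-12.c
test below is the kind of input that produces the sequence above.  Compiled
with -O2 -march=armv8-a+sve -fstack-clash-protection -ftree-vectorize
--param stack-clash-protection-guard-size=16 the vectorizer ends up spilling
SVE registers, so the frame grows by a runtime multiple of the vector length
and the prologue needs the probing loop:

        #include <stdint.h>

        #define N 20040

        void __attribute__ ((noinline, noclone))
        test (int8_t *restrict dest, int8_t *restrict src)
        {
          /* The strided accesses are vectorized with SVE and keep enough
             vectors live that some are spilled to the stack, so the frame
             size is only known at run time.  */
          for (int i = 0; i < N; i += 8)
            {
              dest[i] += src[i * 4];
              dest[i + 1] += src[i * 4 + 1];
              dest[i + 2] += src[i * 4 + 2];
              dest[i + 3] += src[i * 4 + 3];
              dest[i + 4] += src[i * 4 + 4];
              dest[i + 5] += src[i * 4 + 5];
              dest[i + 6] += src[i * 4 + 6];
              dest[i + 7] += src[i * 4 + 7];
            }
        }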


Bootstrapped and regtested on aarch64-none-linux-gnu with no issues in the SVE testsuite.
The target was tested with stack clash protection both on and off by default.

Ok for trunk?

Thanks,
Tamar

gcc/
2018-08-28  Tamar Christina  <tamar.christ...@arm.com>

        PR target/86486
        * config/aarch64/aarch64-protos.h
        (aarch64_output_probe_sve_stack_clash): New.
        * config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash): New.
        (aarch64_allocate_and_probe_stack_space): Add SVE specific section.
        * config/aarch64/aarch64.md (probe_sve_stack_clash): New.

gcc/testsuite/
2018-08-28  Tamar Christina  <tamar.christ...@arm.com>

        PR target/86486
        * gcc.target/aarch64/stack-check-prologue-12.c: New test.
        * gcc.target/aarch64/stack-check-cfa-3.c: New test.

-- 
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef95fc829b83886e2ff00e4664e31af916e99b0c..e2d8734a8d5e513588e3b0318e9c67fdaebdf0d4 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -453,6 +453,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
 void aarch64_cpu_cpp_builtins (cpp_reader *);
 const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
 const char * aarch64_output_probe_stack_range (rtx, rtx);
+const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 06451f38b11822ea77323438fe8c7e373eb9e614..e7efde79bb111e820f4df44a276f6f73070ecd17 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3970,6 +3970,90 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   return "";
 }
 
+/* Emit the probe loop for doing stack clash probes and stack adjustments for
+   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
+   of GUARD_SIZE and emits a probe when at least LIMIT bytes are allocated.  By
+   the end of this function BASE = BASE - ADJUSTMENT.  */
+
+const char *
+aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment, rtx limit,
+				      rtx guard_size)
+{
+  /* This function is not allowed to use any instruction generation function
+     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
+     so instead emit the code you want using output_asm_insn.  */
+  gcc_assert (flag_stack_clash_protection);
+  gcc_assert (CONST_INT_P (limit) && CONST_INT_P (guard_size));
+  gcc_assert (aarch64_uimm12_shift (INTVAL (limit)));
+  gcc_assert (aarch64_uimm12_shift (INTVAL (guard_size)));
+
+  static int labelno = 0;
+  char loop_start_lab[32];
+  char loop_res_lab[32];
+  char loop_end_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSRL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_res_lab, "BRRCHK", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "BERCHK", labelno++);
+
+  /* Emit loop start label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
+
+  /* Test if ADJUSTMENT < GUARD_SIZE.  */
+  xops[0] = adjustment;
+  xops[1] = guard_size;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to residual loop if it is.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_res_lab);
+  fputc ('\n', asm_out_file);
+
+  /* BASE = BASE - GUARD_SIZE.  */
+  xops[0] = base;
+  xops[1] = guard_size;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Probe at BASE + LIMIT.  */
+  xops[1] = limit;
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* ADJUSTMENT = ADJUSTMENT - GUARD_SIZE.  */
+  xops[0] = adjustment;
+  xops[1] = guard_size;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Branch to loop start.  */
+  fputs ("\tb\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_start_lab);
+  fputc ('\n', asm_out_file);
+
+  /* Emit residual check label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_res_lab);
+
+  /* BASE = BASE - ADJUSTMENT.  */
+  xops[0] = base;
+  xops[1] = adjustment;
+  output_asm_insn ("sub\t%0, %0, %1", xops);
+
+  /* Test if BASE < LIMIT.  */
+  xops[1] = limit;
+  output_asm_insn ("cmp\t%0, %1", xops);
+
+  /* Branch to end.  */
+  fputs ("\tb.lt\t", asm_out_file);
+  assemble_name_raw (asm_out_file, loop_end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* Probe at BASE + LIMIT.  */
+  output_asm_insn ("str\txzr, [%0, %1]", xops);
+
+  /* No further probe needed; emit the end label.  */
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
+  return "";
+}
+
 /* Determine whether a frame chain needs to be generated.  */
 static bool
 aarch64_needs_frame_chain (void)
@@ -4826,22 +4910,30 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 	}
     }
 
-  HOST_WIDE_INT size;
+  /* Initialize SIZE to placate GCC's initialization analysis.  */
+  HOST_WIDE_INT size = 0;
   /* If SIZE is not large enough to require probing, just adjust the stack and
      exit.  */
-  if (!poly_size.is_constant (&size)
-      || known_lt (poly_size, min_probe_threshold)
+  if ((poly_size.is_constant (&size)
+       && known_lt (poly_size, min_probe_threshold))
       || !flag_stack_clash_protection)
     {
       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
       return;
     }
 
-  if (dump_file)
+  if (dump_file && poly_size.is_constant ())
     fprintf (dump_file,
 	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
 	     ", probing will be required.\n", size);
 
+  if (dump_file && !poly_size.is_constant ())
+    {
+      fprintf (dump_file, "Stack clash SVE prologue: ");
+      dump_dec (MSG_NOTE, poly_size);
+      fprintf (dump_file, " bytes, dynamic probing will be required.\n");
+    }
+
   /* Round size to the nearest multiple of guard_size, and calculate the
      residual as the difference between the original size and the rounded
      size.  */
@@ -4850,7 +4942,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 
   /* We can handle a small number of allocations/probes inline.  Otherwise
      punt to a loop.  */
-  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
+  if (poly_size.is_constant ()
+      && rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
     {
       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
 	{
@@ -4861,7 +4954,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
 	}
       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
     }
-  else
+  else if (poly_size.is_constant ())
     {
       /* Compute the ending address.  */
       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
@@ -4910,6 +5003,48 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
       emit_insn (gen_blockage ());
       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
     }
+  else
+    {
+      /* First calculate the amount of bytes we're actually spilling.  */
+      aarch64_add_offset (Pmode, temp1, CONST0_RTX (GET_MODE (temp1)),
+			  poly_size, temp1, temp2, false, true);
+
+      rtx_insn *insn = get_last_insn ();
+
+      if (frame_related_p)
+	{
+	  /* This is done to provide unwinding information for the stack
+	     adjustments we're about to do.  However, to prevent the optimizers
+	     from removing the R15 move and leaving the CFA note (which would
+	     be very wrong), we tie the old and new stack pointers together.
+	     The tie will expand to nothing, but the optimizers will not touch
+	     the instruction.  */
+	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
+	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
+	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
+
+	  /* We want the CFA independent of the stack pointer for the
+	     duration of the loop.  */
+	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      rtx probe_const = gen_rtx_CONST_INT (Pmode, STACK_CLASH_CALLER_GUARD);
+      rtx guard_const = gen_rtx_CONST_INT (Pmode, guard_size);
+
+      insn = emit_insn (gen_probe_sve_stack_clash (stack_pointer_rtx,
+						   stack_pointer_rtx, temp1,
+						   probe_const, guard_const));
+
+      /* Now reset the CFA register if needed.  */
+      if (frame_related_p)
+	{
+	  add_reg_note (insn, REG_CFA_DEF_CFA,
+			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+				      gen_int_mode (poly_size, Pmode)));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+    }
 
   /* Handle any residuals.  Residuals of at least min_probe_threshold have to
      be probed.  This maintains the requirement that each page is probed at
@@ -4922,7 +5057,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
      probed by the saving of FP/LR either by this function or any callees.  If
      we don't have any callees then we won't have more stack adjustments and so
      are still safe.  */
-  if (residual)
+  if (poly_size.is_constant () && residual)
     {
       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
       if (residual >= min_probe_threshold)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b8da13f14fa9990e8fdc3c71ed407c8afc65a324..4901f55478eb0ea26a36f15d51aaf9779a8efaf4 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -6464,6 +6464,25 @@
   [(set_attr "length" "32")]
 )
 
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop.  We can't change the control flow during prologue and epilogue
+;; code generation, so we must emit a volatile unspec and expand it later on.
+
+(define_insn "probe_sve_stack_clash"
+  [(set (match_operand:DI 0 "register_operand" "=rk")
+	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
+			     (match_operand:DI 2 "register_operand" "r")
+			     (match_operand:DI 3 "aarch64_plus_immediate" "L")
+			     (match_operand:DI 4 "aarch64_plus_immediate" "L")]
+			      UNSPECV_PROBE_STACK_RANGE))]
+  "TARGET_SVE"
+{
+  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+					       operands[3], operands[4]);
+}
+  [(set_attr "length" "40")]
+)
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..6ea87392843e4b9561cf6d43ffee57887db62e4e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-cfa-3.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables -ftree-vectorize" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#include <stdint.h>
+
+#define N 20040
+
+void __attribute__ ((noinline, noclone))
+test (int8_t *restrict dest, int8_t *restrict src)
+{
+  for (int i = 0; i < N; i+=8)
+    {
+      dest[i] += src[i * 4];
+      dest[i+1] += src[i * 4 + 1];
+      dest[i+2] += src[i * 4 + 2];
+      dest[i+3] += src[i * 4 + 3];
+      dest[i+4] += src[i * 4 + 4];
+      dest[i+5] += src[i * 4 + 5];
+      dest[i+6] += src[i * 4 + 6];
+      dest[i+7] += src[i * 4 + 7];
+    }
+}
+/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
+/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22} 1 } } */
+
+/* Checks that the CFA notes are correct for every sp adjustment, but we also
+   need to make sure we can unwind correctly before the frame is set up.  So
+   check that we're emitting r15 with a copy of sp and setting the CFA there.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-12.c b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-12.c
new file mode 100644
index 0000000000000000000000000000000000000000..d66a2c19f4aec7b1121651da315442303cc1ed54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-prologue-12.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+/* { dg-options "-O2 -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -ftree-vectorize" } */
+
+
+#include <stdint.h>
+
+#define N 20040
+
+void __attribute__ ((noinline, noclone))
+test (int8_t *restrict dest, int8_t *restrict src)
+{
+  for (int i = 0; i < N; i+=8)
+    {
+      dest[i] += src[i * 4];
+      dest[i+1] += src[i * 4 + 1];
+      dest[i+2] += src[i * 4 + 2];
+      dest[i+3] += src[i * 4 + 3];
+      dest[i+4] += src[i * 4 + 4];
+      dest[i+5] += src[i * 4 + 5];
+      dest[i+6] += src[i * 4 + 6];
+      dest[i+7] += src[i * 4 + 7];
+    }
+}
+
+
+/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 1024\]} 2 } } */
+/* { dg-final { scan-assembler-times {cmp\s+sp, 1024} 1 } } */
+/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 65536} 1 } } */
+
+/* The SVE spills require probing, as the vector size is unknown at compile
+   time, so a dynamic probing loop is expected.  */
