varunkumare99 created this revision.
varunkumare99 added reviewers: tnfchris, efriedma, kristof.beyls, 
serge-sans-paille.
Herald added a subscriber: hiraditya.
Herald added a project: All.
varunkumare99 requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, MaskRay.
Herald added projects: clang, LLVM.

Adds stack probing instruction sequences for dynamic stack allocations, VLAs,
and constant arrays, to protect against stack clash attacks.

Depending on the size of the stack allocation, various probing sequences are 
generated:

- A straight-line sequence of subtracts and stores.
- A loop allocating and probing one page per iteration, plus a single probe
to deal with the remainder.
- A loop that moves SP down to a target value held in a register, used when
the allocation size is not known at compile time (a sketch of this sequence
is shown below).
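For illustration, the variable-size case expands into a probing loop along
the lines of the following sketch (adapted from the builtin_alloca test in
this patch; the exact instructions, the 4096-byte probe interval, and the
1024-byte caller-guard offset depend on the subtarget and the probe-size
settings):

  .LBB_loop:
      sub  sp, sp, #4096     @ allocate one probe-interval block
      cmp  sp, r0            @ r0 holds the target SP
      ble  .LBB_exit
      str  r0, [sp, #1024]   @ probe the newly allocated block
      b    .LBB_loop
  .LBB_exit:
      mov  sp, r0            @ commit the final SP
      str  r0, [sp, #1024]   @ probe the residual allocation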

Reference: https://reviews.llvm.org/D96004
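
Usage note: probing is requested per-function via the
"probe-stack"="inline-asm" attribute (as in the tests below), and the probe
interval follows the existing "stack-probe-size" attribute, defaulting to
4096 bytes (see getStackProbeSize). The clang changes in this patch
presumably map -fstack-clash-protection onto these attributes, as is done
for other targets; that part of the diff is not shown here.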


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D154911

Files:
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
  llvm/lib/Target/ARM/ARMFrameLowering.cpp
  llvm/lib/Target/ARM/ARMFrameLowering.h
  llvm/lib/Target/ARM/ARMISelLowering.cpp
  llvm/lib/Target/ARM/ARMISelLowering.h
  llvm/lib/Target/ARM/ARMInstrInfo.td
  llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
  llvm/lib/Target/ARM/Thumb1FrameLowering.h
  llvm/test/CodeGen/ARM/stackProbing_arm.ll
  llvm/test/CodeGen/ARM/stackProbing_thumb.ll
  llvm/test/CodeGen/Thumb2/stackProbing_thumb2.ll

Index: llvm/test/CodeGen/Thumb2/stackProbing_thumb2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/stackProbing_thumb2.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple thumbv8.1m.main-none-linux-eabi < %s -verify-machineinstrs | FileCheck %s
+; Function Attrs: noinline nounwind optnone
+define dso_local void @large_stack() "probe-stack"="inline-asm" "frame-pointer"="none" {
+; CHECK-LABEL: large_stack:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5}
+; CHECK-NEXT:    push {r4, r5}
+; CHECK-NEXT:    sub.w r0, sp, #79872
+; CHECK-NEXT:    subs r0, #132
+; CHECK-NEXT:    .pad #2180
+; CHECK-NEXT:    subw sp, sp, #2180
+; CHECK-NEXT:  .LBB0_1: @ %entry
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    .pad #4096
+; CHECK-NEXT:    sub.w sp, sp, #4096
+; CHECK-NEXT:    cmp sp, r0
+; CHECK-NEXT:    str.w r0, [sp, #1024]
+; CHECK-NEXT:    bne .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %entry
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    add r1, sp, #4
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    movw r0, #19999
+; CHECK-NEXT:  .LBB0_3: @ %for.cond
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr r2, [sp]
+; CHECK-NEXT:    cmp r2, r0
+; CHECK-NEXT:    bhi .LBB0_5
+; CHECK-NEXT:  @ %bb.4: @ %for.body
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    ldr r2, [sp]
+; CHECK-NEXT:    ldr r3, [sp]
+; CHECK-NEXT:    str.w r2, [r1, r3, lsl #2]
+; CHECK-NEXT:    ldr r2, [sp]
+; CHECK-NEXT:    adds r2, #1
+; CHECK-NEXT:    str r2, [sp]
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_5: @ %for.end
+; CHECK-NEXT:    add.w sp, sp, #79872
+; CHECK-NEXT:    add sp, #132
+; CHECK-NEXT:    pop {r4, r5}
+; CHECK-NEXT:    bx lr
+entry:
+  %stack = alloca [20000 x i32], align 4
+  %i = alloca i32, align 4
+  store volatile i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load volatile i32, ptr %i, align 4
+  %cmp = icmp ult i32 %0, 20000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load volatile i32, ptr %i, align 4
+  %2 = load volatile i32, ptr %i, align 4
+  %arrayidx = getelementptr inbounds [20000 x i32], ptr %stack, i32 0, i32 %2
+  store volatile i32 %1, ptr %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %3 = load volatile i32, ptr %i, align 4
+  %inc = add nsw i32 %3, 1
+  store volatile i32 %inc, ptr %i, align 4
+  br label %for.cond, !llvm.loop !6
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @vla(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none" {
+; CHECK-LABEL: vla:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #8
+; CHECK-NEXT:    add r7, sp, #8
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    movs r1, #7
+; CHECK-NEXT:    str r0, [r7, #-12]
+; CHECK-NEXT:    add.w r1, r1, r0, lsl #2
+; CHECK-NEXT:    str sp, [r7, #-16]
+; CHECK-NEXT:    bic r1, r1, #7
+; CHECK-NEXT:    sub.w r1, sp, r1
+; CHECK-NEXT:  .LBB1_1: @ %entry
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    .pad #4096
+; CHECK-NEXT:    sub.w sp, sp, #4096
+; CHECK-NEXT:    cmp sp, r1
+; CHECK-NEXT:    ble .LBB1_3
+; CHECK-NEXT:  @ %bb.2: @ %entry
+; CHECK-NEXT:    @ in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    str.w r1, [sp, #1024]
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_3: @ %entry
+; CHECK-NEXT:    mov sp, r1
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    str.w r1, [sp, #1024]
+; CHECK-NEXT:    strb r2, [r1]
+; CHECK-NEXT:    str r0, [r7, #-20]
+; CHECK-NEXT:    ldr sp, [r7, #-16]
+; CHECK-NEXT:    sub.w r4, r7, #8
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r6, r7, pc}
+entry:
+  %n.addr = alloca i32, align 4
+  %saved_stack = alloca ptr, align 4
+  %__vla_expr0 = alloca i32, align 4
+  store i32 %n, ptr %n.addr, align 4
+  %0 = load i32, ptr %n.addr, align 4
+  %1 = call ptr @llvm.stacksave()
+  store ptr %1, ptr %saved_stack, align 4
+  %vla = alloca i32, i32 %0, align 4
+  store i32 %0, ptr %__vla_expr0, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %vla, i32 0
+  call void @llvm.memset.p0.i32(ptr align 4 %arrayidx, i8 0, i32 1, i1 false)
+  %2 = load ptr, ptr %saved_stack, align 4
+  call void @llvm.stackrestore(ptr %2)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare ptr @llvm.stacksave() #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare void @llvm.stackrestore(ptr) #1
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @builtin_alloca(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none" {
+; CHECK-LABEL: builtin_alloca:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #8
+; CHECK-NEXT:    add r7, sp, #8
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    str r0, [r7, #-12]
+; CHECK-NEXT:    adds r0, #7
+; CHECK-NEXT:    bic r0, r0, #7
+; CHECK-NEXT:    sub.w r0, sp, r0
+; CHECK-NEXT:  .LBB2_1: @ %entry
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    .pad #4096
+; CHECK-NEXT:    sub.w sp, sp, #4096
+; CHECK-NEXT:    cmp sp, r0
+; CHECK-NEXT:    ble .LBB2_3
+; CHECK-NEXT:  @ %bb.2: @ %entry
+; CHECK-NEXT:    @ in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    str.w r0, [sp, #1024]
+; CHECK-NEXT:    b .LBB2_1
+; CHECK-NEXT:  .LBB2_3: @ %entry
+; CHECK-NEXT:    mov sp, r0
+; CHECK-NEXT:    sub.w r4, r7, #8
+; CHECK-NEXT:    str.w r0, [sp, #1024]
+; CHECK-NEXT:    str r0, [r7, #-16]
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r6, r7, pc}
+entry:
+  %n.addr = alloca i32, align 4
+  %mem = alloca ptr, align 4
+  store i32 %n, ptr %n.addr, align 4
+  %0 = load i32, ptr %n.addr, align 4
+  %1 = alloca i8, i32 %0, align 8
+  store ptr %1, ptr %mem, align 4
+  ret void
+}
+
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 7, !"PIE Level", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project a1677bda7975a0f690292587a04b9e053aacd1dc)"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.mustprogress"}
Index: llvm/test/CodeGen/ARM/stackProbing_thumb.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/stackProbing_thumb.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=thumb-eabi -mcpu=cortex-m0 -o - | FileCheck %s
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @large_stack() "probe-stack"="inline-asm" "frame-pointer"="none" {
+; CHECK-LABEL: large_stack:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    ldr r0, .LCPI0_1
+; CHECK-NEXT:    subs r0, r0, r0
+; CHECK-NEXT:    ldr r1, .LCPI0_2
+; CHECK-NEXT:    add sp, r1
+; CHECK-NEXT:  .LBB0_1: @ %entry
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr r1, .LCPI0_2
+; CHECK-NEXT:    add sp, r1
+; CHECK-NEXT:    str r0, [sp, #1024]
+; CHECK-NEXT:    cmp sp, r0
+; CHECK-NEXT:    bne .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %entry
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    ldr r0, .LCPI0_0
+; CHECK-NEXT:  .LBB0_3: @ %for.cond
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr r1, [sp]
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    bhi .LBB0_5
+; CHECK-NEXT:  @ %bb.4: @ %for.body
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    ldr r1, [sp]
+; CHECK-NEXT:    ldr r2, [sp]
+; CHECK-NEXT:    lsls r2, r2, #2
+; CHECK-NEXT:    add r3, sp, #4
+; CHECK-NEXT:    str r1, [r3, r2]
+; CHECK-NEXT:    ldr r1, [sp]
+; CHECK-NEXT:    adds r1, r1, #1
+; CHECK-NEXT:    str r1, [sp]
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_5: @ %for.end
+; CHECK-NEXT:    ldr r6, .LCPI0_1
+; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.6:
+; CHECK-NEXT:  .LCPI0_0:
+; CHECK-NEXT:    .long 19999 @ 0x4e1f
+; CHECK-NEXT:  .LCPI0_1:
+; CHECK-NEXT:    .long 80004 @ 0x13884
+; CHECK-NEXT:  .LCPI0_2:
+; CHECK-NEXT:    .long 4294963200 @ 0xfffff000
+entry:
+  %stack = alloca [20000 x i32], align 4
+  %i = alloca i32, align 4
+  store volatile i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load volatile i32, ptr %i, align 4
+  %cmp = icmp ult i32 %0, 20000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load volatile i32, ptr %i, align 4
+  %2 = load volatile i32, ptr %i, align 4
+  %arrayidx = getelementptr inbounds [20000 x i32], ptr %stack, i32 0, i32 %2
+  store volatile i32 %1, ptr %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %3 = load volatile i32, ptr %i, align 4
+  %inc = add nsw i32 %3, 1
+  store volatile i32 %inc, ptr %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @vla(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none" {
+; CHECK-LABEL: vla:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #8
+; CHECK-NEXT:    add r7, sp, #8
+; CHECK-NEXT:    ldr r1, .LCPI1_0
+; CHECK-NEXT:    add sp, r1
+; CHECK-NEXT:    mov r6, sp
+; CHECK-NEXT:    adds r1, r6, #4
+; CHECK-NEXT:    str r0, [r1, #8]
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    str r2, [r1, #4]
+; CHECK-NEXT:    lsls r2, r0, #2
+; CHECK-NEXT:    adds r2, r2, #7
+; CHECK-NEXT:    movs r3, #7
+; CHECK-NEXT:    bics r2, r3
+; CHECK-NEXT:    mov r3, sp
+; CHECK-NEXT:    subs r2, r3, r2
+; CHECK-NEXT:  .LBB1_1: @ %entry
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr r6, .LCPI1_0
+; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    cmp sp, r2
+; CHECK-NEXT:    ble .LBB1_3
+; CHECK-NEXT:  @ %bb.2: @ %entry
+; CHECK-NEXT:    @ in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    str r2, [sp, #1024]
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_3: @ %entry
+; CHECK-NEXT:    mov sp, r2
+; CHECK-NEXT:    str r2, [sp, #1024]
+; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:    strb r3, [r2]
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    ldr r0, [r1, #4]
+; CHECK-NEXT:    mov sp, r0
+; CHECK-NEXT:    subs r4, r7, #7
+; CHECK-NEXT:    subs r4, #1
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.4:
+; CHECK-NEXT:  .LCPI1_0:
+; CHECK-NEXT:    .long 4294963200 @ 0xfffff000
+entry:
+  %n.addr = alloca i32, align 4
+  %saved_stack = alloca ptr, align 4
+  %__vla_expr0 = alloca i32, align 4
+  store i32 %n, ptr %n.addr, align 4
+  %0 = load i32, ptr %n.addr, align 4
+  %1 = call ptr @llvm.stacksave()
+  store ptr %1, ptr %saved_stack, align 4
+  %vla = alloca i32, i32 %0, align 4
+  store i32 %0, ptr %__vla_expr0, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %vla, i32 0
+  call void @llvm.memset.p0.i32(ptr align 4 %arrayidx, i8 0, i32 1, i1 false)
+  %2 = load ptr, ptr %saved_stack, align 4
+  call void @llvm.stackrestore(ptr %2)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare ptr @llvm.stacksave() #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare void @llvm.stackrestore(ptr) #1
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @builtin_alloca(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none" {
+; CHECK-LABEL: builtin_alloca:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #8
+; CHECK-NEXT:    add r7, sp, #8
+; CHECK-NEXT:    ldr r1, .LCPI2_0
+; CHECK-NEXT:    add sp, r1
+; CHECK-NEXT:    mov r6, sp
+; CHECK-NEXT:    mov r1, r6
+; CHECK-NEXT:    str r0, [r1, #4]
+; CHECK-NEXT:    adds r0, r0, #7
+; CHECK-NEXT:    movs r2, #7
+; CHECK-NEXT:    bics r0, r2
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:  .LBB2_1: @ %entry
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr r6, .LCPI2_0
+; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    cmp sp, r0
+; CHECK-NEXT:    ble .LBB2_3
+; CHECK-NEXT:  @ %bb.2: @ %entry
+; CHECK-NEXT:    @ in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    str r0, [sp, #1024]
+; CHECK-NEXT:    b .LBB2_1
+; CHECK-NEXT:  .LBB2_3: @ %entry
+; CHECK-NEXT:    mov sp, r0
+; CHECK-NEXT:    str r0, [sp, #1024]
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    subs r4, r7, #7
+; CHECK-NEXT:    subs r4, #1
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r6, r7, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.4:
+; CHECK-NEXT:  .LCPI2_0:
+; CHECK-NEXT:    .long 4294963200 @ 0xfffff000
+entry:
+  %n.addr = alloca i32, align 4
+  %mem = alloca ptr, align 4
+  store i32 %n, ptr %n.addr, align 4
+  %0 = load i32, ptr %n.addr, align 4
+  %1 = alloca i8, i32 %0, align 8
+  store ptr %1, ptr %mem, align 4
+  ret void
+}
+
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project a1677bda7975a0f690292587a04b9e053aacd1dc)"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
Index: llvm/test/CodeGen/ARM/stackProbing_arm.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/stackProbing_arm.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple arm-eabi-linux < %s -verify-machineinstrs | FileCheck %s
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @large_stack() "probe-stack"="inline-asm" "frame-pointer"="none" {
+; CHECK-LABEL: large_stack:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5}
+; CHECK-NEXT:    push {r4, r5}
+; CHECK-NEXT:    sub r0, sp, #132
+; CHECK-NEXT:    sub r0, r0, #79872
+; CHECK-NEXT:    .pad #132
+; CHECK-NEXT:    sub sp, sp, #132
+; CHECK-NEXT:    .pad #2048
+; CHECK-NEXT:    sub sp, sp, #2048
+; CHECK-NEXT:  .LBB0_1: @ %entry
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    .pad #4096
+; CHECK-NEXT:    sub sp, sp, #4096
+; CHECK-NEXT:    cmp sp, r0
+; CHECK-NEXT:    str r0, [sp, #1024]
+; CHECK-NEXT:    bne .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %entry
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    add r1, sp, #4
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    mov r0, #31
+; CHECK-NEXT:    orr r0, r0, #19968
+; CHECK-NEXT:  .LBB0_3: @ %for.cond
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr r2, [sp]
+; CHECK-NEXT:    cmp r2, r0
+; CHECK-NEXT:    bhi .LBB0_5
+; CHECK-NEXT:  @ %bb.4: @ %for.body
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    ldr r2, [sp]
+; CHECK-NEXT:    ldr r3, [sp]
+; CHECK-NEXT:    str r2, [r1, r3, lsl #2]
+; CHECK-NEXT:    ldr r2, [sp]
+; CHECK-NEXT:    add r2, r2, #1
+; CHECK-NEXT:    str r2, [sp]
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_5: @ %for.end
+; CHECK-NEXT:    add sp, sp, #132
+; CHECK-NEXT:    add sp, sp, #79872
+; CHECK-NEXT:    pop {r4, r5}
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %stack = alloca [20000 x i32], align 4
+  %i = alloca i32, align 4
+  store volatile i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load volatile i32, ptr %i, align 4
+  %cmp = icmp ult i32 %0, 20000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load volatile i32, ptr %i, align 4
+  %2 = load volatile i32, ptr %i, align 4
+  %arrayidx = getelementptr inbounds [20000 x i32], ptr %stack, i32 0, i32 %2
+  store volatile i32 %1, ptr %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %3 = load volatile i32, ptr %i, align 4
+  %inc = add nsw i32 %3, 1
+  store volatile i32 %inc, ptr %i, align 4
+  br label %for.cond, !llvm.loop !6
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @vla(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none" {
+; CHECK-LABEL: vla:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r11}
+; CHECK-NEXT:    push {r4, r5, r11}
+; CHECK-NEXT:    .setfp r11, sp, #8
+; CHECK-NEXT:    add r11, sp, #8
+; CHECK-NEXT:    .pad #12
+; CHECK-NEXT:    sub sp, sp, #12
+; CHECK-NEXT:    mov r1, #7
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    add r1, r1, r0, lsl #2
+; CHECK-NEXT:    str r0, [r11, #-12]
+; CHECK-NEXT:    bic r1, r1, #7
+; CHECK-NEXT:    str sp, [r11, #-16]
+; CHECK-NEXT:    sub r3, r2, r1
+; CHECK-NEXT:  .LBB1_1: @ %entry
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #4096
+; CHECK-NEXT:    cmp sp, r3
+; CHECK-NEXT:    ble .LBB1_3
+; CHECK-NEXT:  @ %bb.2: @ %entry
+; CHECK-NEXT:    @ in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    str r3, [sp, #1024]
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_3: @ %entry
+; CHECK-NEXT:    mov sp, r3
+; CHECK-NEXT:    str r3, [sp, #1024]
+; CHECK-NEXT:    mov r3, #0
+; CHECK-NEXT:    strb r3, [r2, -r1]
+; CHECK-NEXT:    str r0, [r11, #-20]
+; CHECK-NEXT:    ldr sp, [r11, #-16]
+; CHECK-NEXT:    sub sp, r11, #8
+; CHECK-NEXT:    pop {r4, r5, r11}
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %n.addr = alloca i32, align 4
+  %saved_stack = alloca ptr, align 4
+  %__vla_expr0 = alloca i32, align 4
+  store i32 %n, ptr %n.addr, align 4
+  %0 = load i32, ptr %n.addr, align 4
+  %1 = call ptr @llvm.stacksave()
+  store ptr %1, ptr %saved_stack, align 4
+  %vla = alloca i32, i32 %0, align 4
+  store i32 %0, ptr %__vla_expr0, align 4
+  %arrayidx = getelementptr inbounds i32, ptr %vla, i32 0
+  call void @llvm.memset.p0.i32(ptr align 4 %arrayidx, i8 0, i32 1, i1 false)
+  %2 = load ptr, ptr %saved_stack, align 4
+  call void @llvm.stackrestore(ptr %2)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare ptr @llvm.stacksave() #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare void @llvm.stackrestore(ptr) #1
+
+; Function Attrs: noinline nounwind optnone
+define dso_local void @builtin_alloca(i32 noundef %n) "probe-stack"="inline-asm" "frame-pointer"="none" {
+; CHECK-LABEL: builtin_alloca:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r11}
+; CHECK-NEXT:    push {r4, r5, r11}
+; CHECK-NEXT:    .setfp r11, sp, #8
+; CHECK-NEXT:    add r11, sp, #8
+; CHECK-NEXT:    .pad #12
+; CHECK-NEXT:    sub sp, sp, #12
+; CHECK-NEXT:    str r0, [r11, #-12]
+; CHECK-NEXT:    add r0, r0, #7
+; CHECK-NEXT:    bic r0, r0, #7
+; CHECK-NEXT:    sub r0, sp, r0
+; CHECK-NEXT:  .LBB2_1: @ %entry
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #4096
+; CHECK-NEXT:    cmp sp, r0
+; CHECK-NEXT:    ble .LBB2_3
+; CHECK-NEXT:  @ %bb.2: @ %entry
+; CHECK-NEXT:    @ in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    str r0, [sp, #1024]
+; CHECK-NEXT:    b .LBB2_1
+; CHECK-NEXT:  .LBB2_3: @ %entry
+; CHECK-NEXT:    mov sp, r0
+; CHECK-NEXT:    str r0, [sp, #1024]
+; CHECK-NEXT:    str r0, [r11, #-16]
+; CHECK-NEXT:    sub sp, r11, #8
+; CHECK-NEXT:    pop {r4, r5, r11}
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %n.addr = alloca i32, align 4
+  %mem = alloca ptr, align 4
+  store i32 %n, ptr %n.addr, align 4
+  %0 = load i32, ptr %n.addr, align 4
+  %1 = alloca i8, i32 %0, align 8
+  store ptr %1, ptr %mem, align 4
+  ret void
+}
+
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 7, !"PIE Level", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project a1677bda7975a0f690292587a04b9e053aacd1dc)"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.mustprogress"}
Index: llvm/lib/Target/ARM/Thumb1FrameLowering.h
===================================================================
--- llvm/lib/Target/ARM/Thumb1FrameLowering.h
+++ llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -42,6 +42,16 @@
                                 MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MI) const override;
 
+  /// Replace a StackProbe stub (if any) with the actual probe code inline.
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologueMBB) const override;
+  MachineBasicBlock::iterator
+  inlineStackProbeFixed(MachineFunction &MF,
+                        MachineBasicBlock::iterator MBBI) const;
+  MachineBasicBlock::iterator
+  inlineStackProbeVar(MachineFunction &MF,
+                      MachineBasicBlock::iterator MBBI) const;
+
   /// Check whether or not the given \p MBB can be used as a epilogue
   /// for the target.
   /// The epilogue will be inserted before the first terminator of that block.
@@ -54,6 +64,10 @@
     return false;
   }
 
+  MachineBasicBlock::iterator
+  insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+                         Register TargetReg) const override;
+
 private:
   /// Check if the frame lowering of \p MF needs a special fixup
   /// code sequence for the epilogue.
Index: llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -63,6 +63,28 @@
   return !MFI.hasVarSizedObjects();
 }
 
+static unsigned
+findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
+                                 const ThumbRegisterInfo *RegInfo) {
+  MachineFunction *MF = MBB->getParent();
+
+  LivePhysRegs LiveRegs(*RegInfo);
+  LiveRegs.addLiveIns(*MBB);
+
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass &RC = ARM::GPRRegClass;
+  for (unsigned Reg : RC) {
+    if (LiveRegs.available(MRI, Reg))
+      return Reg;
+  }
+  return ARM::NoRegister;
+}
+
 static void
 emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MBBI,
@@ -157,7 +179,7 @@
       *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
 
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
-  unsigned NumBytes = MFI.getStackSize();
+  int NumBytes = MFI.getStackSize();
   assert(NumBytes >= ArgRegsSaveSize &&
          "ArgRegsSaveSize is included in NumBytes");
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
@@ -414,21 +436,37 @@
   }
 
   if (NumBytes) {
-    // Insert it after all the callee-save spills.
-    //
-    // For a large stack frame, we might need a scratch register to store
-    // the size of the frame.  We know all callee-save registers are free
-    // at this point in the prologue, so pick one.
-    unsigned ScratchRegister = ARM::NoRegister;
-    for (auto &I : CSI) {
-      Register Reg = I.getReg();
-      if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
-        ScratchRegister = Reg;
-        break;
+    const ARMTargetLowering *TLI =
+        MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+    bool NeedsStackProbe =
+        TLI->hasInlineStackProbe(MF) &&
+        (NumBytes >= TLI->getStackProbeMaxUnprobedStack(MF) ||
+         MFI.hasVarSizedObjects());
+    bool NeedsRealignment = RegInfo->hasStackRealignment(MF);
+    if (NeedsStackProbe && !NeedsRealignment) {
+      Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB, RegInfo);
+      assert(ScratchReg != ARM::NoRegister);
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::PROBED_STACKALLOC))
+          .addDef(ScratchReg)
+          .addImm(-NumBytes);
+    } else {
+      // Insert it after all the callee-save spills.
+      //
+      // For a large stack frame, we might need a scratch register to store
+      // the size of the frame.  We know all callee-save registers are free
+      // at this point in the prologue, so pick one.
+      unsigned ScratchRegister = ARM::NoRegister;
+      for (auto &I : CSI) {
+        Register Reg = I.getReg();
+        if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
+          ScratchRegister = Reg;
+          break;
+        }
       }
+      emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+                                   ScratchRegister, MachineInstr::FrameSetup);
     }
-    emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
-                                 ScratchRegister, MachineInstr::FrameSetup);
+
     if (!HasFP) {
       CFAOffset += NumBytes;
       unsigned CFIIndex = MF.addFrameInst(
@@ -449,6 +487,11 @@
 
   if (RegInfo->hasStackRealignment(MF)) {
     const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
+    const ARMTargetLowering *TLI =
+        MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+    bool NeedsStackProbe = TLI->hasInlineStackProbe(MF) &&
+                           (NumBytes + MFI.getMaxAlign().value()) >=
+                               TLI->getStackProbeMaxUnprobedStack(MF);
     // Emit the following sequence, using R4 as a temporary, since we cannot use
     // SP as a source or destination register for the shifts:
     // mov  r4, sp
@@ -471,9 +514,14 @@
       .addImm(NrBitsToZero)
       .add(predOps(ARMCC::AL));
 
-    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
-      .addReg(ARM::R4, RegState::Kill)
-      .add(predOps(ARMCC::AL));
+    if (NeedsStackProbe) {
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::PROBED_STACKALLOC_VAR))
+          .addUse(ARM::R4);
+    } else {
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+          .addReg(ARM::R4, RegState::Kill)
+          .add(predOps(ARMCC::AL));
+    }
 
     AFI->setShouldRestoreSPFromFP(true);
   }
@@ -1202,3 +1250,254 @@
 
   return true;
 }
+
+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
+///  least every NegProbeSize bytes. Returns an iterator of the first
+///  instruction after the loop. The difference between SP and TargetReg must be
+///  an exact multiple of NegProbeSize.
+
+static MachineBasicBlock::iterator inlineStackProbeLoopExactMultiple(
+    MachineFunction &MF, MachineBasicBlock::iterator MBBI, int64_t NegProbeSize,
+    Register TargetReg, const TargetInstrInfo &TII,
+    const ThumbRegisterInfo *RegInfo) {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // ADD SP, SP, #NegProbeSize (or equivalent if NegProbeSize is not
+  // encodable in ADD).
+  auto loopMBBend = LoopMBB->end();
+  emitCallSPUpdate(*LoopMBB, loopMBBend, TII, DL, *RegInfo, NegProbeSize);
+
+  // STR TargetReg, [SP, #StackClashCallerGuard]
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::tSTRi))
+      .addReg(TargetReg)
+      .addReg(ARM::SP)
+      .addImm(ARM::StackClashCallerGuard / 4)
+      .add(predOps(ARMCC::AL));
+
+  // CMP SP, TargetReg
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::tCMPr))
+      .addReg(ARM::SP)
+      .addReg(TargetReg)
+      .add(predOps(ARMCC::AL));
+
+  // B.CC Loop
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::tBcc))
+      .addMBB(LoopMBB)
+      .addImm(ARMCC::NE)
+      .addReg(ARM::CPSR);
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
+MachineBasicBlock::iterator Thumb1FrameLowering::inlineStackProbeFixed(
+    MachineFunction &MF, MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  const ARMTargetLowering *TLI =
+      MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+  const Thumb1InstrInfo &TII =
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register ScratchReg = MBBI->getOperand(0).getReg();
+  int64_t NegFrameSize = MBBI->getOperand(1).getImm();
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  int64_t NumBlocks = NegFrameSize / NegProbeSize;
+  int64_t NegResidualSize = NegFrameSize % NegProbeSize;
+  bool NeedResidualProbe =
+      NegResidualSize <= -(int64_t)TLI->getStackProbeMaxUnprobedStack(MF);
+  bool UnrollProbeLoop = NumBlocks <= ARM::StackClashCallerMaxUnrollPage;
+
+  MachineBasicBlock::iterator NextInst;
+  if (UnrollProbeLoop) {
+    for (int i = 0; i < NumBlocks; ++i) {
+      emitCallSPUpdate(MBB, MBBI, TII, DL, *RegInfo, NegProbeSize);
+      // STR ScratchReg, [SP, #StackClashCallerGuard]
+      BuildMI(MBB, MBBI, DL, TII.get(ARM::tSTRi))
+          .addReg(ScratchReg)
+          .addReg(ARM::SP)
+          .addImm(ARM::StackClashCallerGuard / 4)
+          .add(predOps(ARMCC::AL));
+    }
+    NextInst = std::next(MBBI);
+  } else if (NumBlocks != 0) {
+    // Materialize the target SP in ScratchReg:
+    //   MOV ScratchReg, SP   (via emitThumbRegPlusImmediate with offset 0)
+    //   ADD ScratchReg, ScratchReg, #NegFrameSize (or an equivalent sequence
+    //   if NegFrameSize is not encodable in a single ADD).
+
+    emitThumbRegPlusImmediate(MBB, MBBI, DL, ScratchReg, ARM::SP, 0, TII,
+                              *RegInfo, MachineInstr::NoFlags);
+    emitThumbRegPlusImmediate(MBB, MBBI, DL, ScratchReg, ScratchReg,
+                              NegFrameSize, TII, *RegInfo,
+                              MachineInstr::NoFlags);
+
+    NextInst = inlineStackProbeLoopExactMultiple(MF, MBBI, NegProbeSize,
+                                                 ScratchReg, TII, RegInfo);
+  }
+
+  if (NegResidualSize != 0) {
+    // ADD SP, SP, #NegResidualSize (or equivalent if NegResidualSize is not
+    // encodable in ADD).
+    emitCallSPUpdate(MBB, MBBI, TII, DL, *RegInfo, NegResidualSize);
+    if (NeedResidualProbe) {
+      // STR ScratchReg, [SP, #StackClashCallerGuard]
+      BuildMI(MBB, MBBI, DL, TII.get(ARM::tSTRi))
+          .addReg(ScratchReg)
+          .addReg(ARM::SP)
+          .addImm(ARM::StackClashCallerGuard / 4)
+          .add(predOps(ARMCC::AL));
+    }
+  }
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+MachineBasicBlock::iterator
+Thumb1FrameLowering::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+                                            Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMTargetLowering *TLI =
+      MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+  const Thumb1InstrInfo &TII =
+      *static_cast<const Thumb1InstrInfo *>(STI.getInstrInfo());
+  const ThumbRegisterInfo *RegInfo =
+      static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
+
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  auto loopTestMBBItr = LoopTestMBB->end();
+  unsigned ScratchRegister = ARM::NoRegister;
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  bool HasFP = hasFP(MF);
+  Register FramePtr = RegInfo->getFrameRegister(MF);
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  for (auto &I : CSI) {
+    Register Reg = I.getReg();
+    if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
+      ScratchRegister = Reg;
+      break;
+    }
+  }
+
+  // LoopTest:
+  //   SUB SP, SP, #ProbeSize
+  emitPrologueEpilogueSPUpdate(*LoopTestMBB, loopTestMBBItr, TII, DL, *RegInfo,
+                               NegProbeSize, ScratchRegister,
+                               MachineInstr::NoFlags);
+
+  //   CMP SP, TargetReg
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::tCMPr))
+      .addReg(ARM::SP)
+      .addReg(TargetReg)
+      .add(predOps(ARMCC::AL));
+
+  //   B.LE LoopExit
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::tBcc))
+      .addMBB(ExitMBB)
+      .addImm(ARMCC::LE)
+      .addReg(ARM::CPSR);
+
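+  // LoopBody: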
+  //   STR TargetReg, [SP, #StackClashCallerGuard]
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::tSTRi))
+      .addReg(TargetReg)
+      .addReg(ARM::SP)
+      .addImm(ARM::StackClashCallerGuard / 4)
+      .add(predOps(ARMCC::AL));
+
+  //   B loop
+  BuildMI(LoopBodyMBB, DL, TII.get(ARM::tB))
+      .addMBB(LoopTestMBB)
+      .add(predOps(ARMCC::AL));
+
+  // LoopExit:
+  //   MOV SP, TargetReg
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::tMOVr), ARM::SP)
+      .addReg(TargetReg)
+      .add(predOps(ARMCC::AL));
+
+  //   STR TargetReg, [SP, #StackClashCallerGuard]
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::tSTRi))
+      .addReg(TargetReg)
+      .addReg(ARM::SP)
+      .addImm(ARM::StackClashCallerGuard / 4)
+      .add(predOps(ARMCC::AL));
+
+  LoopTestMBB->addSuccessor(ExitMBB);
+  LoopTestMBB->addSuccessor(LoopBodyMBB);
+  LoopBodyMBB->addSuccessor(LoopTestMBB);
+
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopTestMBB);
+
+  // Update liveins.
+  if (MF.getRegInfo().reservedRegsFrozen()) {
+    recomputeLiveIns(*LoopTestMBB);
+    recomputeLiveIns(*LoopBodyMBB);
+    recomputeLiveIns(*ExitMBB);
+  }
+
+  return ExitMBB->begin();
+}
+
+MachineBasicBlock::iterator Thumb1FrameLowering::inlineStackProbeVar(
+    MachineFunction &MF, MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+
+  Register TargetReg = MBBI->getOperand(0).getReg();
+  MachineBasicBlock::iterator NextInst =
+      insertStackProbingLoop(MBBI, TargetReg);
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+void Thumb1FrameLowering::inlineStackProbe(MachineFunction &MF,
+                                           MachineBasicBlock &MBB) const {
+  for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) {
+    if (MBBI->getOpcode() == ARM::PROBED_STACKALLOC) {
+      MBBI = inlineStackProbeFixed(MF, MBBI);
+      E = MBBI->getParent()->end();
+    } else if (MBBI->getOpcode() == ARM::PROBED_STACKALLOC_VAR) {
+      MBBI = inlineStackProbeVar(MF, MBBI);
+      E = MBBI->getParent()->end();
+    } else {
+      ++MBBI;
+    }
+  }
+}
Index: llvm/lib/Target/ARM/ARMInstrInfo.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrInfo.td
+++ llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -2169,6 +2169,10 @@
                         i32imm:$size), NoItinerary, []>;
 
 
+def ARMprobedalloca
+    : SDNode<"ARMISD::PROBED_ALLOCA",
+             SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+             [SDNPHasChain]>;
 // FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
 // from removing one half of the matched pairs. That breaks PEI, which assumes
 // these will always be in pairs, and asserts if it finds otherwise. Better way?
@@ -2180,6 +2184,33 @@
 def ADJCALLSTACKDOWN :
 PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary,
            [(ARMcallseq_start timm:$amt, timm:$amt2)]>;
+
+// Probed stack allocation of a constant size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC : PseudoInst<(outs GPR:$scratch),
+                               (ins i32imm:$stacksize),
+                               NoItinerary,
+                               []>,
+                               Sched<[]>;
+
+// Probed stack allocation of a variable size, used in function prologues when
+// stack-clash protection is enabled. The register input is the target SP,
+// which should be below the current value, and has no alignment requirements
+// beyond the usual AAPCS stack alignment.
+def PROBED_STACKALLOC_VAR : PseudoInst<(outs),
+                                   (ins GPR:$target),
+                                   NoItinerary,
+                                   []>,
+                                   Sched<[]>;
+
+// Probed stack allocations of a variable size, used for allocas of unknown size
+// when stack-clash protection is enabled.
+def PROBED_STACKALLOC_DYN
+    : PseudoInst<(outs),
+             (ins GPR:$target),
+             NoItinerary,
+             [(ARMprobedalloca GPR:$target)]>,
+      Sched<[]>;
 }
 
 def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -112,6 +112,10 @@
     SUBE, // Sub using carry
     LSLS, // Shift left producing carry
 
+    // Dynamic stack allocation with stack clash protection; the allocation is
+    // done in blocks, and each block is probed with a store.
+    PROBED_ALLOCA,
+
     VMOVRRD, // double to two gprs.
     VMOVDRR, // Two gprs to double.
     VMOVSR,  // move gpr to single, used for f32 literal constructed in a gpr
@@ -754,6 +758,17 @@
         ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
         Value *Accumulator = nullptr) const override;
 
+    /// True if stack clash protection is enabled for this function.
+    bool hasInlineStackProbe(const MachineFunction &MF) const override;
+
+    /// Get the interval between stack-clash probes, which is equal to the stack
+    /// guard size, in bytes.
+    unsigned getStackProbeSize(MachineFunction &MF) const;
+
+    /// Get the maximum allowed number of unprobed bytes above SP at an ABI
+    /// boundary.
+    unsigned getStackProbeMaxUnprobedStack(MachineFunction &MF) const;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -851,6 +866,8 @@
                                    SDValue &Chain) const;
     SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
@@ -993,7 +1010,12 @@
 
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                              const TargetLibraryInfo *libInfo);
-
+    // The number of bytes by which a caller is allowed to grow the stack
+    // before it must emit a probe when stack clash protection is enabled.
+    static constexpr unsigned StackClashCallerGuard = 1024;
+    // The maximum number of pages for which to unroll the probing loop of a
+    // fixed-size allocation during stack clash protection probing.
+    static constexpr unsigned StackClashCallerMaxUnrollPage = 4;
   } // end namespace ARM
 
 } // end namespace llvm
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1311,10 +1311,7 @@
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 
-  if (Subtarget->isTargetWindows())
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
-  else
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
 
   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
   // the default expansion.
@@ -1729,6 +1726,7 @@
     MAKE_CASE(ARMISD::TC_RETURN)
     MAKE_CASE(ARMISD::THREAD_POINTER)
     MAKE_CASE(ARMISD::DYN_ALLOC)
+    MAKE_CASE(ARMISD::PROBED_ALLOCA)
     MAKE_CASE(ARMISD::MEMBARRIER_MCR)
     MAKE_CASE(ARMISD::PRELOAD)
     MAKE_CASE(ARMISD::LDRD)
@@ -10585,9 +10583,7 @@
   case ISD::SDIVREM:
   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
-    if (Subtarget->isTargetWindows())
-      return LowerDYNAMIC_STACKALLOC(Op, DAG);
-    llvm_unreachable("Don't know how to custom lower this!");
+    return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::STRICT_FP_ROUND:
   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
   case ISD::STRICT_FP_EXTEND:
@@ -20745,7 +20741,7 @@
 }
 
 SDValue
-ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
+ARMTargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
   assert(Subtarget->isTargetWindows() && "unsupported target platform");
   SDLoc DL(Op);
 
@@ -22129,3 +22125,82 @@
 
   return nullptr;
 }
+
+bool ARMTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
+  // If the function specifically requests inline stack probes, emit them.
+  if (MF.getFunction().hasFnAttribute("probe-stack")) {
+    if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+        "inline-asm")
+      return true;
+    else
+      llvm_unreachable("Unsupported stack probing method");
+  }
+
+  return false;
+}
+
+unsigned ARMTargetLowering::getStackProbeSize(MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = Subtarget->getFrameLowering();
+  unsigned StackAlign = TFI->getStackAlignment();
+  assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
+         "Unexpected stack alignment");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  unsigned StackProbeSize = 4096;
+  const Function &Fn = MF.getFunction();
+  if (Fn.hasFnAttribute("stack-probe-size"))
+    Fn.getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  // Round down to the stack alignment.
+  StackProbeSize &= ~(StackAlign - 1);
+  return StackProbeSize ? StackProbeSize : StackAlign;
+}
+
+unsigned
+ARMTargetLowering::getStackProbeMaxUnprobedStack(MachineFunction &MF) const {
+  // Since the ABI requires saving FP/LR (or just LR for leaf functions), the
+  // save acts as an implicit stack probe. Probing at StackClashCallerGuard
+  // bytes means that the rest of the guard page (whose size is given by the
+  // stack-probe-size attribute) can be used without further probing.
+  return getStackProbeSize(MF) - ARM::StackClashCallerGuard;
+}
+
+SDValue
+ARMTargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  EVT VT = Node->getValueType(0);
+
+  // Construct the new SP value in a GPR.
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, ARM::SP, MVT::i32);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i32, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+
+  // Set the real SP to the new value with a probing loop.
+  Chain = DAG.getNode(ARMISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  if (Subtarget->isTargetWindows())
+    return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
+  if (hasInlineStackProbe(MF))
+    return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
+  return SDValue();
+}
Index: llvm/lib/Target/ARM/ARMFrameLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMFrameLowering.h
+++ llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -79,6 +79,10 @@
   const SpillSlot *
   getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
 
+  virtual MachineBasicBlock::iterator
+  insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+                         Register TargetReg) const;
+
 private:
   void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                     ArrayRef<CalleeSavedInfo> CSI, unsigned StmOpc,
@@ -94,6 +98,15 @@
   eliminateCallFramePseudoInstr(MachineFunction &MF,
                                 MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MI) const override;
+  /// Replace a StackProbe stub (if any) with the actual probe code inline.
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologueMBB) const override;
+  MachineBasicBlock::iterator
+  inlineStackProbeFixed(MachineFunction &MF,
+                        MachineBasicBlock::iterator MBBI) const;
+  MachineBasicBlock::iterator
+  inlineStackProbeVar(MachineFunction &MF,
+                      MachineBasicBlock::iterator MBBI) const;
 };
 
 } // end namespace llvm
Index: llvm/lib/Target/ARM/ARMFrameLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -120,6 +120,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -173,6 +174,8 @@
 skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
                         unsigned NumAlignedDPRCS2Regs);
 
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB);
+
 ARMFrameLowering::ARMFrameLowering(const ARMSubtarget &sti)
     : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, Align(4)),
       STI(sti) {}
@@ -739,12 +742,13 @@
   const MCRegisterInfo *MRI = Context.getRegisterInfo();
   const ARMBaseRegisterInfo *RegInfo = STI.getRegisterInfo();
   const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
+  const ARMTargetLowering &TLI = *STI.getTargetLowering();
   assert(!AFI->isThumb1OnlyFunction() &&
          "This emitPrologue does not support Thumb1!");
   bool isARM = !AFI->isThumbFunction();
   Align Alignment = STI.getFrameLowering()->getStackAlign();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
-  unsigned NumBytes = MFI.getStackSize();
+  int NumBytes = MFI.getStackSize();
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
   int FPCXTSaveSize = 0;
   bool NeedsWinCFI = needsWinCFI(MF);
@@ -1038,11 +1042,21 @@
   }
 
   if (NumBytes) {
+    bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) &&
+                           (NumBytes >= TLI.getStackProbeMaxUnprobedStack(MF) ||
+                            MFI.hasVarSizedObjects());
+    bool NeedsRealignment = RegInfo->hasStackRealignment(MF);
     // Adjust SP after all the callee-save spills.
     if (AFI->getNumAlignedDPRCS2Regs() == 0 &&
         tryFoldSPUpdateIntoPushPop(STI, MF, &*LastPush, NumBytes))
       DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes);
-    else {
+    else if (NeedsStackProbe && !NeedsRealignment) {
+      Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+      assert(ScratchReg != ARM::NoRegister);
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::PROBED_STACKALLOC))
+          .addDef(ScratchReg)
+          .addImm(-NumBytes);
+    } else {
       emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
                    MachineInstr::FrameSetup);
       DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes);
@@ -1222,8 +1236,19 @@
     Align MaxAlign = MFI.getMaxAlign();
     assert(!AFI->isThumb1OnlyFunction());
     if (!AFI->isThumbFunction()) {
-      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
-                               false);
+      bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) &&
+                             (NumBytes + MFI.getMaxAlign().value()) >=
+                                 TLI.getStackProbeMaxUnprobedStack(MF);
+      if (NeedsStackProbe) {
+        Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+        emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ScratchReg,
+                                 MaxAlign, false);
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::PROBED_STACKALLOC_VAR))
+            .addUse(ScratchReg);
+      } else {
+        emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
+                                 false);
+      }
     } else {
       // We cannot use sp as source/dest register here, thus we're using r4 to
       // perform the calculations. We're emitting the following sequence:
@@ -1232,14 +1257,29 @@
       // -- out lower bits in r4
       // mov sp, r4
       // FIXME: It will be better just to find spare register here.
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
-          .addReg(ARM::SP, RegState::Kill)
-          .add(predOps(ARMCC::AL));
-      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
-                               false);
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
-          .addReg(ARM::R4, RegState::Kill)
-          .add(predOps(ARMCC::AL));
+      bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) &&
+                             (NumBytes + MFI.getMaxAlign().value()) >=
+                                 TLI.getStackProbeMaxUnprobedStack(MF);
+      if (NeedsStackProbe) {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVr), ARM::R4)
+            .addReg(ARM::SP, RegState::Kill)
+            .add(predOps(ARMCC::AL))
+            .add(condCodeOp());
+
+        emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
+                                 false);
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::PROBED_STACKALLOC_VAR))
+            .addUse(ARM::R4);
+      } else {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+            .addReg(ARM::SP, RegState::Kill)
+            .add(predOps(ARMCC::AL));
+        emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
+                                 false);
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+            .addReg(ARM::R4, RegState::Kill)
+            .add(predOps(ARMCC::AL));
+      }
     }
 
     AFI->setShouldRestoreSPFromFP(true);
@@ -3385,3 +3425,379 @@
   MF.verify();
 #endif
 }
+
+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
+///  least every NegProbeSize bytes. Returns an iterator of the first
+///  instruction after the loop. The difference between SP and TargetReg must be
+///  an exact multiple of NegProbeSize.
+
+static MachineBasicBlock::iterator inlineStackProbeLoopExactMultiple(
+    MachineFunction &MF, MachineBasicBlock::iterator MBBI, int64_t NegProbeSize,
+    Register TargetReg, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI) {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isARM = !AFI->isThumbFunction();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  if (isARM) {
+    // ADD SP, SP, #NegProbeSize (or equivalent if NegProbeSize is not
+    // encodable in ADD).
+    auto loopMBBend = LoopMBB->end();
+    emitSPUpdate(true, *LoopMBB, loopMBBend, DL, TII, NegProbeSize,
+                 MachineInstr::FrameSetup);
+
+    // STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .addImm(ARMCC::AL)
+        .addImm(0);
+
+    // CMP SP, TargetReg
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::CMPrr))
+        .addReg(ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL));
+
+    // BNE Loop
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::Bcc))
+        .addMBB(LoopMBB)
+        .addImm(ARMCC::NE)
+        .addReg(ARM::CPSR);
+  } else {
+    // ADD SP, SP, #NegFrameSize (or equivalent if NegFrameSize is not encodable
+    // in ADD).
+    auto loopMBBend = LoopMBB->end();
+    emitSPUpdate(isARM, *LoopMBB, loopMBBend, DL, TII, NegProbeSize,
+                 MachineInstr::FrameSetup);
+
+    // STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::t2STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .add(predOps(ARMCC::AL));
+
+    // CMP SP, TargetReg
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::t2CMPrr))
+        .addReg(ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL));
+
+    // BNE Loop
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, TII.get(ARM::t2Bcc))
+        .addMBB(LoopMBB)
+        .addImm(ARMCC::NE)
+        .addReg(ARM::CPSR);
+  }
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
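+/// Replace a PROBED_STACKALLOC pseudo with a probed allocation of a size
+/// known at compile time: either an unrolled, straight-line sequence of SP
+/// updates and probing stores (for small allocations), or a probing loop
+/// over the page-sized blocks followed by an SP update and, if needed, a
+/// probe for the residual bytes.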
+MachineBasicBlock::iterator ARMFrameLowering::inlineStackProbeFixed(
+    MachineFunction &MF, MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  const ARMTargetLowering *TLI =
+      MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isARM = !AFI->isThumbFunction();
+  bool HasFP = hasFP(MF);
+  bool NeedsWinCFI = needsWinCFI(MF);
+  bool EmitCFI = !NeedsWinCFI;
+  bool HasWinCFI = false;
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register ScratchReg = MBBI->getOperand(0).getReg();
+  int64_t NegFrameSize = MBBI->getOperand(1).getImm();
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  int64_t NumBlocks = NegFrameSize / NegProbeSize;
+  int64_t NegResidualSize = NegFrameSize % NegProbeSize;
+  bool NeedResidualProbe =
+      NegResidualSize <= -(int64_t)TLI->getStackProbeMaxUnprobedStack(MF);
+  bool UnrollProbeLoop = NumBlocks <= ARM::StackClashCallerMaxUnrollPage;
+  LLVM_DEBUG(dbgs() << "Stack probing (fixed): total " << NegFrameSize
+                    << " bytes, " << NumBlocks << " blocks of " << NegProbeSize
+                    << " bytes, plus " << NegResidualSize << " residual bytes"
+                    << " (probed: " << NeedResidualProbe
+                    << "), Unroll: " << UnrollProbeLoop
+                    << ", CFI: " << (EmitCFI && !HasFP) << "\n");
+
+  MachineBasicBlock::iterator NextInst;
+  if (UnrollProbeLoop) {
+    for (int i = 0; i < NumBlocks; ++i) {
+      if (isARM) {
+        // SUB SP, SP, #ProbeSize (or an equivalent sequence if the probe
+        // size is not encodable in a single instruction).
+        emitSPUpdate(isARM, MBB, MBBI, DL, TII, NegProbeSize,
+                     MachineInstr::FrameSetup);
+        // STR ScratchReg, [SP, #StackClashCallerGuard]
+        BuildMI(MBB, MBBI, DL, TII.get(ARM::STRi12))
+            .addReg(ScratchReg)
+            .addReg(ARM::SP)
+            .addImm(ARM::StackClashCallerGuard)
+            .addImm(ARMCC::AL)
+            .addImm(0);
+      } else {
+        // SUB SP, SP, #ProbeSize (or an equivalent sequence if the probe
+        // size is not encodable in a single instruction).
+        emitSPUpdate(isARM, MBB, MBBI, DL, TII, NegProbeSize,
+                     MachineInstr::FrameSetup);
+
+        // STR ScratchReg, [SP, #StackClashCallerGuard]
+        BuildMI(MBB, MBBI, DL, TII.get(ARM::t2STRi12))
+            .addReg(ScratchReg)
+            .addReg(ARM::SP)
+            .addImm(ARM::StackClashCallerGuard)
+            .add(predOps(ARMCC::AL));
+      }
+    }
+    NextInst = std::next(MBBI);
+  } else if (NumBlocks != 0) {
+    // ADD ScratchReg, SP, #NegFrameSize (or an equivalent sequence if
+    // NegFrameSize is not encodable in a single instruction).
+    emitRegPlusImmediate(isARM, MBB, MBBI, DL, TII, ScratchReg, ARM::SP,
+                         NegFrameSize, MachineInstr::NoFlags, ARMCC::AL, 0);
+
+    NextInst = inlineStackProbeLoopExactMultiple(
+        MF, MBBI, NegProbeSize, ScratchReg, NeedsWinCFI, &HasWinCFI, EmitCFI);
+  }
+
+  if (NegResidualSize != 0) {
+    // SUB SP, SP, #ResidualSize (or an equivalent sequence if the residual
+    // size is not encodable in a single instruction).
+    if (isARM) {
+      emitSPUpdate(isARM, MBB, MBBI, DL, TII, NegResidualSize,
+                   MachineInstr::FrameSetup);
+      if (NeedResidualProbe) {
+        // STR ScratchReg, [SP, #StackClashCallerGuard]
+        BuildMI(MBB, MBBI, DL, TII.get(ARM::STRi12))
+            .addReg(ScratchReg)
+            .addReg(ARM::SP)
+            .addImm(ARM::StackClashCallerGuard)
+            .addImm(ARMCC::AL)
+            .addImm(0);
+      }
+    } else {
+      emitSPUpdate(isARM, MBB, MBBI, DL, TII, NegResidualSize,
+                   MachineInstr::FrameSetup);
+      if (NeedResidualProbe) {
+        // STR ScratchReg, [SP, #StackClashCallerGuard]
+        BuildMI(MBB, MBBI, DL, TII.get(ARM::t2STRi12))
+            .addReg(ScratchReg)
+            .addReg(ARM::SP)
+            .addImm(ARM::StackClashCallerGuard)
+            .add(predOps(ARMCC::AL));
+      }
+    }
+  }
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
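+/// Replace a PROBED_STACKALLOC_VAR pseudo with a loop that walks SP down,
+/// one probe-size block at a time, to the target value already computed
+/// into the pseudo's register operand.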
+MachineBasicBlock::iterator
+ARMFrameLowering::inlineStackProbeVar(MachineFunction &MF,
+                                      MachineBasicBlock::iterator MBBI) const {
+  Register TargetReg = MBBI->getOperand(0).getReg();
+  MachineBasicBlock::iterator NextInst =
+      insertStackProbingLoop(MBBI, TargetReg);
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
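+/// Emit the probing loop used for variable-size allocations. The emitted
+/// control flow looks roughly like:
+///   LoopTest:
+///     sub sp, sp, #ProbeSize
+///     cmp sp, TargetReg
+///     ble LoopExit
+///   LoopBody:
+///     str TargetReg, [sp, #StackClashCallerGuard]
+///     b   LoopTest
+///   LoopExit:
+///     mov sp, TargetReg
+///     str TargetReg, [sp, #StackClashCallerGuard]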
+MachineBasicBlock::iterator
+ARMFrameLowering::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+                                         Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isARM = !AFI->isThumbFunction();
+  const ARMTargetLowering *TLI =
+      MF.getSubtarget<ARMSubtarget>().getTargetLowering();
+  const ARMBaseInstrInfo &TII =
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  int64_t NegProbeSize = -(int64_t)TLI->getStackProbeSize(MF);
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // LoopTest:
+  //   SUB SP, SP, #ProbeSize
+  if (isARM) {
+    MachineBasicBlock::iterator LoopTestMBBItr = LoopTestMBB->end();
+    emitSPUpdate(isARM, *LoopTestMBB, LoopTestMBBItr, DL, TII, NegProbeSize,
+                 MachineInstr::FrameSetup);
+
+    //   CMP SP, TargetReg
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::CMPrr))
+        .addReg(ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL));
+
+    //   B.LE LoopExit
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::Bcc))
+        .addMBB(ExitMBB)
+        .addImm(ARMCC::LE)
+        .addReg(ARM::CPSR);
+
+    // LoopBody:
+    //   STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .addImm(ARMCC::AL)
+        .addImm(0);
+
+    //   B loop
+    BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::B))
+        .addMBB(LoopTestMBB);
+
+    // LoopExit:
+    //   MOV SP, TargetReg
+    BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::MOVr), ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL))
+        .add(condCodeOp());
+
+    //   STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .addImm(ARMCC::AL)
+        .addImm(0);
+  } else {
+    MachineBasicBlock::iterator LoopTestMBBItr = LoopTestMBB->end();
+    emitSPUpdate(isARM, *LoopTestMBB, LoopTestMBBItr, DL, TII, NegProbeSize,
+                 MachineInstr::FrameSetup);
+
+    //   CMP SP, TargetReg
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::t2CMPrr))
+        .addReg(ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL));
+
+    //   B.LE LoopExit
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII.get(ARM::t2Bcc))
+        .addMBB(ExitMBB)
+        .addImm(ARMCC::LE)
+        .addReg(ARM::CPSR);
+
+    // LoopBody:
+    //   STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::t2STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .add(predOps(ARMCC::AL));
+
+    //   B loop
+    BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII.get(ARM::t2B))
+        .addMBB(LoopTestMBB)
+        .add(predOps(ARMCC::AL));
+
+    // LoopExit:
+    //   MOV SP, TargetReg
+    BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::t2MOVr), ARM::SP)
+        .addReg(TargetReg)
+        .add(predOps(ARMCC::AL))
+        .add(condCodeOp());
+
+    //   STR TargetReg, [SP, #StackClashCallerGuard]
+    BuildMI(*ExitMBB, ExitMBB->end(), DL, TII.get(ARM::t2STRi12))
+        .addReg(TargetReg)
+        .addReg(ARM::SP)
+        .addImm(ARM::StackClashCallerGuard)
+        .add(predOps(ARMCC::AL));
+  }
+
+  LoopTestMBB->addSuccessor(ExitMBB);
+  LoopTestMBB->addSuccessor(LoopBodyMBB);
+  LoopBodyMBB->addSuccessor(LoopTestMBB);
+
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopTestMBB);
+
+  // Update liveins.
+  if (MF.getRegInfo().reservedRegsFrozen()) {
+    recomputeLiveIns(*LoopTestMBB);
+    recomputeLiveIns(*LoopBodyMBB);
+    recomputeLiveIns(*ExitMBB);
+  }
+
+  return ExitMBB->begin();
+}
+
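+/// Expand any stack-probing pseudo-instructions in MBB in place. Expansion
+/// may split the block, so the scan continues in the block that now holds
+/// the remaining instructions.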
+void ARMFrameLowering::inlineStackProbe(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
+  for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) {
+    if (MBBI->getOpcode() == ARM::PROBED_STACKALLOC) {
+      MBBI = inlineStackProbeFixed(MF, MBBI);
+      E = MBBI->getParent()->end();
+    } else if (MBBI->getOpcode() == ARM::PROBED_STACKALLOC_VAR) {
+      MBBI = inlineStackProbeVar(MF, MBBI);
+      E = MBBI->getParent()->end();
+    } else {
+      ++MBBI;
+    }
+  }
+}
+
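+/// Return a GPR that is neither live-in to MBB nor callee-saved, for use as
+/// a scratch register, or ARM::NoRegister if none is available.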
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
+  MachineFunction *MF = MBB->getParent();
+
+  const ARMSubtarget &Subtarget = MF->getSubtarget<ARMSubtarget>();
+  const ARMBaseRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+  LivePhysRegs LiveRegs(TRI);
+  LiveRegs.addLiveIns(*MBB);
+
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass &RC = ARM::GPRRegClass;
+  for (unsigned Reg : RC) {
+    if (LiveRegs.available(MRI, Reg))
+      return Reg;
+  }
+  return ARM::NoRegister;
+}
Index: llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -2040,6 +2040,15 @@
   }
 }
 
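+// Expand a PROBED_STACKALLOC_DYN pseudo (emitted for dynamic stack
+// allocations) into the frame lowering's probing loop. The pseudo's single
+// register operand holds the target SP value.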
+static void EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) {
+  Register TargetReg = MI.getOperand(0).getReg();
+  MachineFunction *MF = MBB->getParent();
+  const ARMFrameLowering *TFI =
+      MF->getSubtarget<ARMSubtarget>().getFrameLowering();
+  TFI->insertStackProbingLoop(MI, TargetReg);
+  MI.eraseFromParent();
+}
+
 bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MBBI,
                                MachineBasicBlock::iterator &NextMBBI) {
@@ -2048,7 +2057,11 @@
   switch (Opcode) {
     default:
       return false;
-
+    case ARM::PROBED_STACKALLOC_DYN: {
+      EmitDynamicProbedAlloc(MI, &MBB);
+      NextMBBI = MBB.end();
+      return true;
+    }
     case ARM::VBSPd:
     case ARM::VBSPq: {
       Register DstReg = MI.getOperand(0).getReg();
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -3447,11 +3447,18 @@
     return;
 
   if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ() &&
-      !EffectiveTriple.isPPC64())
+      !EffectiveTriple.isPPC64() && !EffectiveTriple.isARM() &&
+      !EffectiveTriple.isThumb())
     return;
 
   Args.addOptInFlag(CmdArgs, options::OPT_fstack_clash_protection,
                     options::OPT_fno_stack_clash_protection);
+  if (Args.hasArg(options::OPT_mstack_probe_size)) {
+    StringRef Size = Args.getLastArgValue(options::OPT_mstack_probe_size);
+    CmdArgs.push_back(Args.MakeArgString("-mstack-probe-size=" + Size));
+  } else if (EffectiveTriple.isARM() || EffectiveTriple.isThumb()) {
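+    // No explicit -mstack-probe-size: default to a 1024-byte probe interval
+    // on ARM/Thumb.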
+    CmdArgs.push_back("-mstack-probe-size=1024");
+  }
 }
 
 static void RenderTrivialAutoVarInitOptions(const Driver &D,
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -2257,8 +2257,12 @@
   if ((!D || !D->hasAttr<NoUwtableAttr>()) && CodeGenOpts.UnwindTables)
     B.addUWTableAttr(llvm::UWTableKind(CodeGenOpts.UnwindTables));
 
-  if (CodeGenOpts.StackClashProtector)
+  if (CodeGenOpts.StackClashProtector) {
     B.addAttribute("probe-stack", "inline-asm");
+    if (CodeGenOpts.StackProbeSize != 4096)
+      B.addAttribute("stack-probe-size",
+                     llvm::utostr(CodeGenOpts.StackProbeSize));
+  }
 
   if (!hasUnwindExceptions(LangOpts))
     B.addAttribute(llvm::Attribute::NoUnwind);
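
As a usage sketch (file name and triple below are illustrative only, not
taken from this patch's tests):

  /* vla.c - a dynamic allocation that should be lowered through
     PROBED_STACKALLOC_VAR once stack clash protection is enabled. */
  void use(char *);
  void f(unsigned n) {
    char buf[n]; /* size unknown at compile time */
    use(buf);
  }

  $ clang --target=arm-none-linux-gnueabi -O2 -fstack-clash-protection -S vla.c

With -fstack-clash-protection, CodeGenModule tags the function with
"probe-stack"="inline-asm", so the allocation above should be emitted as the
SP-decrementing probe loop rather than a single unprobed SP update.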