[PATCH] D108138: [WIP] Remove switch statements before vectorization

Kerry McLaughlin via Phabricator via cfe-commits Wed, 15 Sep 2021 08:21:04 -0700

kmclaughlin updated this revision to Diff 372706.
kmclaughlin retitled this revision from "[SimplifyCFG] Remove switch statements 
before vectorization" to "[WIP] Remove switch statements before vectorization".
kmclaughlin edited the summary of this revision.
kmclaughlin added a comment.
Herald added subscribers: kerbowa, nhaehnle, jvesely.


- Removed the changes to SimplifyCFG; LowerSwitch is now run before
vectorisation instead.
- Added `SimpleSwitchConvert` to LowerSwitch, which is used when the pass is run
before vectorisation. It only considers simple switches (those where each
destination block is unique) that are also part of a loop.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D108138/new/

https://reviews.llvm.org/D108138

Files:
  clang/test/Frontend/optimization-remark-analysis.c
  llvm/include/llvm/Transforms/Utils/LowerSwitch.h
  llvm/lib/Passes/PassBuilder.cpp
  llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
  llvm/lib/Transforms/Utils/FixIrreducible.cpp
  llvm/lib/Transforms/Utils/LowerSwitch.cpp
  llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
  llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
  llvm/test/Other/new-pm-defaults.ll
  llvm/test/Other/new-pm-lto-defaults.ll
  llvm/test/Other/new-pm-thinlto-defaults.ll
  llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
  llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
  llvm/test/Transforms/LoopVectorize/AArch64/sve-remove-switches.ll
  llvm/test/Transforms/LoopVectorize/remove-switches.ll
  llvm/test/Transforms/LowerSwitch/simple-switches.ll
  llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll

Index: llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll
===================================================================
--- llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll
+++ llvm/test/Transforms/StructurizeCFG/workarounds/needs-fr-ule.ll
@@ -13,32 +13,32 @@
 ; CHECK-NEXT:    [[PRED11_INV:%.*]] = xor i1 [[PRED11:%.*]], true
 ; CHECK-NEXT:    [[PRED12_INV:%.*]] = xor i1 [[PRED12:%.*]], true
 ; CHECK-NEXT:    [[PRED13_INV:%.*]] = xor i1 [[PRED13:%.*]], true
-; CHECK-NEXT:    br i1 [[PRED0_INV]], label [[IF_THEN:%.*]], label [[FLOW19:%.*]]
-; CHECK:       Flow19:
+; CHECK-NEXT:    br i1 [[PRED0_INV]], label [[IF_THEN:%.*]], label [[FLOW18:%.*]]
+; CHECK:       Flow18:
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ false, [[FLOW3:%.*]] ], [ true, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_END:%.*]], label [[FLOW20:%.*]]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_END:%.*]], label [[FLOW19:%.*]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    br i1 [[PRED1_INV]], label [[IF_ELSE:%.*]], label [[FLOW18:%.*]]
-; CHECK:       Flow18:
+; CHECK-NEXT:    br i1 [[PRED1_INV]], label [[IF_ELSE:%.*]], label [[FLOW17:%.*]]
+; CHECK:       Flow17:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ false, [[IF_ELSE]] ], [ true, [[IF_END]] ]
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN7:%.*]], label [[IF_END16:%.*]]
 ; CHECK:       if.then7:
 ; CHECK-NEXT:    br label [[IF_END16]]
 ; CHECK:       if.else:
-; CHECK-NEXT:    br label [[FLOW18]]
-; CHECK:       Flow20:
+; CHECK-NEXT:    br label [[FLOW17]]
+; CHECK:       Flow19:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       if.end16:
-; CHECK-NEXT:    br i1 [[PRED2_INV]], label [[IF_THEN39:%.*]], label [[FLOW16:%.*]]
-; CHECK:       Flow16:
+; CHECK-NEXT:    br i1 [[PRED2_INV]], label [[IF_THEN39:%.*]], label [[FLOW15:%.*]]
+; CHECK:       Flow15:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ false, [[FLOW5:%.*]] ], [ true, [[IF_END16]] ]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[WHILE_COND_PREHEADER:%.*]], label [[FLOW17:%.*]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[WHILE_COND_PREHEADER:%.*]], label [[FLOW16:%.*]]
 ; CHECK:       while.cond.preheader:
 ; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-; CHECK:       Flow17:
-; CHECK-NEXT:    br label [[FLOW20]]
+; CHECK:       Flow16:
+; CHECK-NEXT:    br label [[FLOW19]]
 ; CHECK:       while.cond:
-; CHECK-NEXT:    br i1 [[PRED3_INV]], label [[LOR_RHS:%.*]], label [[FLOW12:%.*]]
+; CHECK-NEXT:    br i1 [[PRED3_INV]], label [[LOR_RHS:%.*]], label [[FLOW11:%.*]]
 ; CHECK:       Flow7:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi i1 [ [[PRED7:%.*]], [[COND_END61:%.*]] ], [ false, [[IRR_GUARD:%.*]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ false, [[COND_END61]] ], [ true, [[IRR_GUARD]] ]
@@ -46,30 +46,30 @@
 ; CHECK:       cond.true49:
 ; CHECK-NEXT:    br label [[FLOW8]]
 ; CHECK:       Flow8:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ false, [[COND_TRUE49]] ], [ true, [[FLOW7:%.*]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi i1 [ [[PRED4_INV]], [[COND_TRUE49]] ], [ [[TMP3]], [[FLOW7]] ]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[WHILE_BODY63:%.*]], label [[FLOW9:%.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ true, [[COND_TRUE49]] ], [ false, [[FLOW7:%.*]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi i1 [ false, [[COND_TRUE49]] ], [ true, [[FLOW7]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i1 [ [[PRED4_INV]], [[COND_TRUE49]] ], [ [[TMP3]], [[FLOW7]] ]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[WHILE_BODY63:%.*]], label [[FLOW9:%.*]]
 ; CHECK:       while.body63:
 ; CHECK-NEXT:    br i1 [[PRED5_INV]], label [[WHILE_COND47:%.*]], label [[FLOW10:%.*]]
 ; CHECK:       Flow9:
-; CHECK-NEXT:    [[TMP7:%.*]] = phi i1 [ true, [[FLOW10]] ], [ false, [[FLOW8]] ]
 ; CHECK-NEXT:    [[TMP8:%.*]] = phi i1 [ false, [[FLOW10]] ], [ [[TMP5]], [[FLOW8]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ [[TMP15:%.*]], [[FLOW10]] ], [ true, [[FLOW8]] ]
-; CHECK-NEXT:    [[DOTINV11:%.*]] = xor i1 [[TMP7]], true
-; CHECK-NEXT:    [[DOTINV:%.*]] = xor i1 [[TMP8]], true
-; CHECK-NEXT:    br i1 [[TMP9]], label [[LOOP_EXIT_GUARD1:%.*]], label [[IRR_GUARD]]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i1 [ false, [[FLOW10]] ], [ [[TMP6]], [[FLOW8]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ [[TMP16:%.*]], [[FLOW10]] ], [ true, [[FLOW8]] ]
+; CHECK-NEXT:    [[DOTINV:%.*]] = xor i1 [[TMP9]], true
+; CHECK-NEXT:    br i1 [[TMP10]], label [[LOOP_EXIT_GUARD1:%.*]], label [[IRR_GUARD]]
 ; CHECK:       while.cond47:
 ; CHECK-NEXT:    br label [[FLOW10]]
 ; CHECK:       cond.end61:
 ; CHECK-NEXT:    br label [[FLOW7]]
-; CHECK:       Flow14:
-; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ false, [[FLOW15:%.*]] ], [ true, [[LOOP_EXIT_GUARD1]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = phi i1 [ [[TMP14:%.*]], [[FLOW15]] ], [ [[DOTINV]], [[LOOP_EXIT_GUARD1]] ]
-; CHECK-NEXT:    br label [[FLOW13:%.*]]
+; CHECK:       Flow13:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i1 [ false, [[FLOW14:%.*]] ], [ true, [[LOOP_EXIT_GUARD1]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i1 [ [[TMP15:%.*]], [[FLOW14]] ], [ [[DOTINV]], [[LOOP_EXIT_GUARD1]] ]
+; CHECK-NEXT:    br label [[FLOW12:%.*]]
 ; CHECK:       if.then69:
-; CHECK-NEXT:    br label [[FLOW15]]
+; CHECK-NEXT:    br label [[FLOW14]]
 ; CHECK:       lor.rhs:
-; CHECK-NEXT:    br label [[FLOW12]]
+; CHECK-NEXT:    br label [[FLOW11]]
 ; CHECK:       while.end76:
 ; CHECK-NEXT:    br label [[FLOW6:%.*]]
 ; CHECK:       if.then39:
@@ -87,39 +87,39 @@
 ; CHECK:       Flow:
 ; CHECK-NEXT:    br label [[FLOW3]]
 ; CHECK:       Flow3:
-; CHECK-NEXT:    br label [[FLOW19]]
+; CHECK-NEXT:    br label [[FLOW18]]
 ; CHECK:       Flow4:
 ; CHECK-NEXT:    br label [[FLOW5]]
 ; CHECK:       Flow5:
-; CHECK-NEXT:    br label [[FLOW16]]
+; CHECK-NEXT:    br label [[FLOW15]]
 ; CHECK:       Flow6:
-; CHECK-NEXT:    br label [[FLOW17]]
+; CHECK-NEXT:    br label [[FLOW16]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
-; CHECK:       Flow12:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi i1 [ false, [[LOR_RHS]] ], [ true, [[WHILE_COND]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = phi i1 [ [[PRED9:%.*]], [[LOR_RHS]] ], [ [[PRED3]], [[WHILE_COND]] ]
-; CHECK-NEXT:    br i1 [[TMP13]], label [[IRR_GUARD]], label [[FLOW13]]
+; CHECK:       Flow11:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i1 [ false, [[LOR_RHS]] ], [ true, [[WHILE_COND]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i1 [ [[PRED9:%.*]], [[LOR_RHS]] ], [ [[PRED3]], [[WHILE_COND]] ]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[IRR_GUARD]], label [[FLOW12]]
 ; CHECK:       irr.guard:
-; CHECK-NEXT:    [[GUARD_COND_TRUE49:%.*]] = phi i1 [ [[PRED6:%.*]], [[FLOW9]] ], [ [[TMP12]], [[FLOW12]] ]
+; CHECK-NEXT:    [[GUARD_COND_TRUE49:%.*]] = phi i1 [ [[PRED6:%.*]], [[FLOW9]] ], [ [[TMP13]], [[FLOW11]] ]
 ; CHECK-NEXT:    [[GUARD_COND_TRUE49_INV:%.*]] = xor i1 [[GUARD_COND_TRUE49]], true
 ; CHECK-NEXT:    br i1 [[GUARD_COND_TRUE49_INV]], label [[COND_END61]], label [[FLOW7]]
-; CHECK:       Flow15:
-; CHECK-NEXT:    [[TMP14]] = phi i1 [ [[PRED8:%.*]], [[IF_THEN69:%.*]] ], [ [[DOTINV]], [[LOOP_EXIT_GUARD2:%.*]] ]
-; CHECK-NEXT:    br label [[FLOW14:%.*]]
+; CHECK:       Flow14:
+; CHECK-NEXT:    [[TMP15]] = phi i1 [ [[PRED8:%.*]], [[IF_THEN69:%.*]] ], [ [[DOTINV]], [[LOOP_EXIT_GUARD2:%.*]] ]
+; CHECK-NEXT:    br label [[FLOW13:%.*]]
 ; CHECK:       loop.exit.guard:
-; CHECK-NEXT:    br i1 [[TMP16:%.*]], label [[WHILE_END76:%.*]], label [[FLOW6]]
+; CHECK-NEXT:    br i1 [[TMP17:%.*]], label [[WHILE_END76:%.*]], label [[FLOW6]]
 ; CHECK:       Flow10:
-; CHECK-NEXT:    [[TMP15]] = phi i1 [ false, [[WHILE_COND47]] ], [ true, [[WHILE_BODY63]] ]
+; CHECK-NEXT:    [[TMP16]] = phi i1 [ false, [[WHILE_COND47]] ], [ true, [[WHILE_BODY63]] ]
 ; CHECK-NEXT:    br label [[FLOW9]]
-; CHECK:       Flow13:
-; CHECK-NEXT:    [[TMP16]] = phi i1 [ [[TMP10]], [[FLOW14]] ], [ true, [[FLOW12]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = phi i1 [ [[TMP11]], [[FLOW14]] ], [ true, [[FLOW12]] ]
-; CHECK-NEXT:    br i1 [[TMP17]], label [[LOOP_EXIT_GUARD:%.*]], label [[WHILE_COND]]
+; CHECK:       Flow12:
+; CHECK-NEXT:    [[TMP17]] = phi i1 [ [[TMP11]], [[FLOW13]] ], [ true, [[FLOW11]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i1 [ [[TMP12]], [[FLOW13]] ], [ true, [[FLOW11]] ]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[LOOP_EXIT_GUARD:%.*]], label [[WHILE_COND]]
 ; CHECK:       loop.exit.guard1:
-; CHECK-NEXT:    br i1 [[DOTINV]], label [[LOOP_EXIT_GUARD2]], label [[FLOW14]]
+; CHECK-NEXT:    br i1 [[DOTINV]], label [[LOOP_EXIT_GUARD2]], label [[FLOW13]]
 ; CHECK:       loop.exit.guard2:
-; CHECK-NEXT:    br i1 [[DOTINV11]], label [[IF_THEN69]], label [[FLOW15]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[IF_THEN69]], label [[FLOW14]]
 ;
 entry:
   br i1 %Pred0, label %if.end, label %if.then
Index: llvm/test/Transforms/LowerSwitch/simple-switches.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LowerSwitch/simple-switches.ll
@@ -0,0 +1,250 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -lowerswitch -force-loop-unswitch -S | FileCheck %s
+; RUN: opt < %s -lowerswitch -force-loop-unswitch -simplifycfg -S | FileCheck --check-prefix=CHECK-SIMPLIFY-CFG %s
+
+define void @unswitch(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i64 %N){
+; CHECK-LABEL: @unswitch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[LEAFBLOCK3:%.*]]
+; CHECK:       LeafBlock3:
+; CHECK-NEXT:    [[SWITCHLEAF4:%.*]] = icmp eq i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[SWITCHLEAF4]], label [[L3:%.*]], label [[LEAFBLOCK1:%.*]]
+; CHECK:       LeafBlock1:
+; CHECK-NEXT:    [[SWITCHLEAF2:%.*]] = icmp eq i32 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[SWITCHLEAF2]], label [[L2:%.*]], label [[LEAFBLOCK:%.*]]
+; CHECK:       LeafBlock:
+; CHECK-NEXT:    [[SWITCHLEAF:%.*]] = icmp eq i32 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[SWITCHLEAF]], label [[L4]], label [[NEWDEFAULT:%.*]]
+; CHECK:       NewDefault:
+; CHECK-NEXT:    br label [[L1:%.*]]
+;
+; CHECK-SIMPLIFY-CFG-LABEL: @unswitch(
+; CHECK-SIMPLIFY-CFG-NEXT:  entry:
+; CHECK-SIMPLIFY-CFG-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-SIMPLIFY-CFG:       for.body:
+; CHECK-SIMPLIFY-CFG-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SIMPLIFY-CFG-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-SIMPLIFY-CFG-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-SIMPLIFY-CFG-NEXT:    switch i32 [[TMP0]], label [[L1:%.*]] [
+; CHECK-SIMPLIFY-CFG-NEXT:    i32 3, label [[L3:%.*]]
+; CHECK-SIMPLIFY-CFG-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-SIMPLIFY-CFG-NEXT:    i32 4, label [[L4]]
+; CHECK-SIMPLIFY-CFG-NEXT:    ]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L4 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L4
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ %0, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7, align 4
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ %0, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  br label %L4
+
+L4:
+  %6 = phi i32 [ %0, %for.body ], [ %add16, %L3 ]
+  %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i
+  %7 = load i32, i32* %arrayidx17
+  %mul19 = mul nsw i32 %7, %7
+  %add21 = add nsw i32 %6, %mul19
+  store i32 %add21, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; The switch statement in this test should not be replaced, as multiple cases (3 and 4) have the same destination block (L3).
+define dso_local void @switch2(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i64 %N) {
+; CHECK-LABEL: @switch2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[L1:%.*]] [
+; CHECK-NEXT:    i32 4, label [[L3]]
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    ]
+;
+; CHECK-SIMPLIFY-CFG-LABEL: @switch2(
+; CHECK-SIMPLIFY-CFG-NEXT:  entry:
+; CHECK-SIMPLIFY-CFG-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-SIMPLIFY-CFG:       for.body:
+; CHECK-SIMPLIFY-CFG-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SIMPLIFY-CFG-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-SIMPLIFY-CFG-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-SIMPLIFY-CFG-NEXT:    switch i32 [[TMP0]], label [[L1:%.*]] [
+; CHECK-SIMPLIFY-CFG-NEXT:    i32 4, label [[L3]]
+; CHECK-SIMPLIFY-CFG-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-SIMPLIFY-CFG-NEXT:    i32 3, label [[L3]]
+; CHECK-SIMPLIFY-CFG-NEXT:    ]
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L3
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ %0, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ %0, %for.body ], [ %0, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define dso_local void @unreachable(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i64 %N) {
+; CHECK-LABEL: @unreachable(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[LEAFBLOCK3:%.*]]
+; CHECK:       LeafBlock3:
+; CHECK-NEXT:    [[SWITCHLEAF4:%.*]] = icmp eq i32 [[TMP0]], 3
+; CHECK-NEXT:    br i1 [[SWITCHLEAF4]], label [[L3]], label [[LEAFBLOCK1:%.*]]
+; CHECK:       LeafBlock1:
+; CHECK-NEXT:    [[SWITCHLEAF2:%.*]] = icmp eq i32 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[SWITCHLEAF2]], label [[L2:%.*]], label [[LEAFBLOCK:%.*]]
+; CHECK:       LeafBlock:
+; CHECK-NEXT:    [[SWITCHLEAF:%.*]] = icmp eq i32 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[SWITCHLEAF]], label [[L1:%.*]], label [[NEWDEFAULT:%.*]]
+; CHECK:       NewDefault:
+; CHECK-NEXT:    br label [[DEFAULT:%.*]]
+; CHECK:       Default:
+; CHECK-NEXT:    unreachable
+;
+; CHECK-SIMPLIFY-CFG-LABEL: @unreachable(
+; CHECK-SIMPLIFY-CFG-NEXT:  entry:
+; CHECK-SIMPLIFY-CFG-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-SIMPLIFY-CFG:       for.body:
+; CHECK-SIMPLIFY-CFG-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-SIMPLIFY-CFG-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-SIMPLIFY-CFG-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-SIMPLIFY-CFG-NEXT:    switch i32 [[TMP0]], label [[DEFAULT:%.*]] [
+; CHECK-SIMPLIFY-CFG-NEXT:    i32 3, label [[L3]]
+; CHECK-SIMPLIFY-CFG-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-SIMPLIFY-CFG-NEXT:    i32 4, label [[L1:%.*]]
+; CHECK-SIMPLIFY-CFG-NEXT:    ]
+; CHECK-SIMPLIFY-CFG:       Default:
+; CHECK-SIMPLIFY-CFG-NEXT:    unreachable
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %Default [
+  i32 4, label %L1
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+Default:
+  unreachable
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ %0, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ %0, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
Index: llvm/test/Transforms/LoopVectorize/remove-switches.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/remove-switches.ll
@@ -0,0 +1,467 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -loop-vectorize -pass-remarks-analysis=loop-vectorize -S 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARKS
+
+; This loop should not be vectorized since masked loads and stores are not available.
+; CHECK-REMARKS: remark: <unknown>:0:0: the cost-model indicates that vectorization is not beneficial
+define void @switch_cost(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, i64 %N) #0 {
+; CHECK-LABEL: @switch_cost(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[L1:%.*]] [
+; CHECK-NEXT:    i32 3, label [[L3:%.*]]
+; CHECK-NEXT:    i32 2, label [[FOR_BODY_L2_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    i32 4, label [[FOR_BODY_L4_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       for.body.L4_crit_edge:
+; CHECK-NEXT:    [[ARRAYIDX17_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[DOTPRE1:%.*]] = load i32, i32* [[ARRAYIDX17_PHI_TRANS_INSERT]], align 4
+; CHECK-NEXT:    br label [[L4]]
+; CHECK:       for.body.L2_crit_edge:
+; CHECK-NEXT:    [[ARRAYIDX7_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, i32* [[ARRAYIDX7_PHI_TRANS_INSERT]], align 4
+; CHECK-NEXT:    br label [[L2:%.*]]
+; CHECK:       L1:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP0]]
+; CHECK-NEXT:    br label [[L2]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_BODY_L2_CRIT_EDGE]] ], [ [[TMP1]], [[L1]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i32 [ 2, [[FOR_BODY_L2_CRIT_EDGE]] ], [ [[ADD]], [[L1]] ]
+; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[MUL9]], [[TMP3]]
+; CHECK-NEXT:    br label [[L3]]
+; CHECK:       L3:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP0]], [[FOR_BODY]] ], [ [[ADD11]], [[L2]] ]
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[MUL14:%.*]] = mul nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[ADD16:%.*]] = add nsw i32 [[MUL14]], [[TMP4]]
+; CHECK-NEXT:    br label [[L4]]
+; CHECK:       L4:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi i32 [ [[DOTPRE1]], [[FOR_BODY_L4_CRIT_EDGE]] ], [ [[TMP5]], [[L3]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i32 [ 4, [[FOR_BODY_L4_CRIT_EDGE]] ], [ [[ADD16]], [[L3]] ]
+; CHECK-NEXT:    [[MUL19:%.*]] = mul nsw i32 [[TMP6]], [[TMP6]]
+; CHECK-NEXT:    [[ADD21:%.*]] = add nsw i32 [[MUL19]], [[TMP7]]
+; CHECK-NEXT:    store i32 [[ADD21]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L4 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L4
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ 2, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ 3, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  br label %L4
+
+L4:
+  %6 = phi i32 [ 4, %for.body ], [ %add16, %L3 ]
+  %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i
+  %7 = load i32, i32* %arrayidx17
+  %mul19 = mul nsw i32 %7, %7
+  %add21 = add nsw i32 %6, %mul19
+  store i32 %add21, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @switch(i32* noalias %a, i32* noalias %b, i64 %N) {
+; CHECK-LABEL: @switch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP14:%.*]] = icmp sgt i64 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP14]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER5:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[PREDPHI_OP:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32> [[PREDPHI_OP]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD4]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER5]]
+; CHECK:       for.body.preheader5:
+; CHECK-NEXT:    [[I_015_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_015:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ [[I_015_PH]], [[FOR_BODY_PREHEADER5]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_015]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP10]], label [[L1:%.*]] [
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    ]
+; CHECK:       L1:
+; CHECK-NEXT:    br label [[L2]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[R_0:%.*]] = phi i32 [ 12, [[L1]] ], [ 5, [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[L3]]
+; CHECK:       L3:
+; CHECK-NEXT:    [[R_1:%.*]] = phi i32 [ [[R_0]], [[L2]] ], [ [[TMP10]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD4:%.*]] = add nuw nsw i32 [[R_1]], 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_015]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP11]], [[ADD4]]
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_015]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+;
+
+entry:
+  %cmp14 = icmp sgt i64 %N, 0
+  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %L3
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %L3
+  %i.015 = phi i64 [ %inc, %L3 ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i.015
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 3, label %L3
+  i32 2, label %L2
+  ]
+
+L1:                                               ; preds = %for.body
+  br label %L2
+
+L2:                                               ; preds = %for.body, %L1
+  %r.0 = phi i32 [ 12, %L1 ], [ 5, %for.body ]
+  br label %L3
+
+L3:                                               ; preds = %for.body, %L2
+  %r.1 = phi i32 [ %r.0, %L2 ], [ 3, %for.body ]
+  %add4 = add nuw nsw i32 %r.1, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i.015
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %add4
+  store i32 %mul, i32* %arrayidx5
+  %inc = add nuw nsw i64 %i.015, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0
+}
+
+define void @switch_VF1_UF2(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, i64 %N) {
+; CHECK-LABEL: @switch_VF1_UF2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[INDUCTION3:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 2
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP2]], 3
+; CHECK-NEXT:    [[DOTNOT9:%.*]] = icmp eq i32 [[TMP3]], 3
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw i32 [[TMP2]], 3
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i32 [[TMP3]], 3
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP4]], i32 2, i32 [[TMP6]]
+; CHECK-NEXT:    [[PREDPHI4:%.*]] = select i1 [[TMP5]], i32 2, i32 [[TMP7]]
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[PRED_LOAD_CONTINUE:%.*]], label [[PRED_LOAD_IF:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    br i1 [[DOTNOT9]], label [[PRED_LOAD_CONTINUE6]], label [[PRED_LOAD_IF5:%.*]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nsw i32 [[TMP10]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw i32 [[TMP13]], 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add nsw i32 [[TMP14]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP15]], [[PREDPHI4]]
+; CHECK-NEXT:    [[PREDPHI7:%.*]] = select i1 [[DOTNOT]], i32 3, i32 [[TMP16]]
+; CHECK-NEXT:    [[PREDPHI8:%.*]] = select i1 [[DOTNOT9]], i32 3, i32 [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDUCTION3]]
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[TMP19]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = shl nsw i32 [[TMP20]], 2
+; CHECK-NEXT:    [[TMP23:%.*]] = shl nsw i32 [[TMP21]], 2
+; CHECK-NEXT:    [[TMP24:%.*]] = add nsw i32 [[TMP22]], [[PREDPHI7]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw i32 [[TMP23]], [[PREDPHI8]]
+; CHECK-NEXT:    store i32 [[TMP24]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP25]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[I_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ [[I_PH]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP27]], label [[FOR_BODY_SWITCH2:%.*]] [
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    ]
+; CHECK:       for.body.switch2:
+; CHECK-NEXT:    [[ADD:%.*]] = mul nsw i32 [[TMP27]], 3
+; CHECK-NEXT:    br label [[L2]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[TMP28:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY_SWITCH2]] ], [ [[TMP27]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[TMP29]], 3
+; CHECK-NEXT:    [[ADD8:%.*]] = add nsw i32 [[MUL6]], [[TMP28]]
+; CHECK-NEXT:    br label [[L3]]
+; CHECK:       L3:
+; CHECK-NEXT:    [[TMP30:%.*]] = phi i32 [ [[ADD8]], [[L2]] ], [ [[TMP27]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[MUL10:%.*]] = shl nsw i32 [[TMP31]], 2
+; CHECK-NEXT:    [[ADD12:%.*]] = add nsw i32 [[MUL10]], [[TMP30]]
+; CHECK-NEXT:    store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  %switch = icmp eq i32 %0, 3
+  br i1 %switch, label %L3, label %for.body.switch
+
+for.body.switch:
+  %switch1 = icmp eq i32 %0, 2
+  br i1 %switch1, label %L2, label %for.body.switch2
+
+for.body.switch2:
+  %add = mul nsw i32 %0, 3
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %1 = phi i32 [ %add, %for.body.switch2 ], [ %0, %for.body.switch ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %2 = load i32, i32* %arrayidx5
+  %mul6 = mul nsw i32 %2, 3
+  %add8 = add nsw i32 %1, %mul6
+  store i32 %add8, i32* %arrayidx
+  br label %L3
+
+L3:
+  %3 = phi i32 [ %0, %for.body ], [ %add8, %L2 ]
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 %i
+  %4 = load i32, i32* %arrayidx9
+  %mul10 = shl nsw i32 %4, 2
+  %add12 = add nsw i32 %3, %mul10
+  store i32 %add12, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+; This loop will not vectorize due to unsafe FP ops. Ensure the switch statement is created again in for.body.
+define float @switch_no_vectorize(i32* noalias %a, i32* noalias readonly %b, i32* noalias readonly %c, float %val, i64 %N) {
+; CHECK-LABEL: @switch_no_vectorize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM_033:%.*]] = phi float [ [[CONV20:%.*]], [[L3]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[L1:%.*]] [
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    ]
+; CHECK:       L1:
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+; CHECK-NEXT:    [[CONV4:%.*]] = fpext float [[CONV]] to double
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[CONV4]], 1.000000e+00
+; CHECK-NEXT:    [[CONV5:%.*]] = fpext float [[SUM_033]] to double
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[ADD]], [[CONV5]]
+; CHECK-NEXT:    [[CONV6:%.*]] = fptrunc double [[MUL]] to float
+; CHECK-NEXT:    br label [[L2]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[SUM_1:%.*]] = phi float [ [[CONV6]], [[L1]] ], [ [[SUM_033]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[CONV8:%.*]] = sitofp i32 [[TMP1]] to float
+; CHECK-NEXT:    [[CONV9:%.*]] = fpext float [[CONV8]] to double
+; CHECK-NEXT:    [[ADD10:%.*]] = fadd double [[CONV9]], 2.000000e+00
+; CHECK-NEXT:    [[CONV11:%.*]] = fpext float [[SUM_1]] to double
+; CHECK-NEXT:    [[MUL12:%.*]] = fmul double [[ADD10]], [[CONV11]]
+; CHECK-NEXT:    [[CONV13:%.*]] = fptrunc double [[MUL12]] to float
+; CHECK-NEXT:    br label [[L3]]
+; CHECK:       L3:
+; CHECK-NEXT:    [[SUM_2:%.*]] = phi float [ [[CONV13]], [[L2]] ], [ [[SUM_033]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[CONV15:%.*]] = sitofp i32 [[TMP2]] to float
+; CHECK-NEXT:    [[CONV16:%.*]] = fpext float [[CONV15]] to double
+; CHECK-NEXT:    [[ADD17:%.*]] = fadd double [[CONV16]], 3.000000e+00
+; CHECK-NEXT:    [[CONV18:%.*]] = fpext float [[SUM_2]] to double
+; CHECK-NEXT:    [[MUL19:%.*]] = fmul double [[ADD17]], [[CONV18]]
+; CHECK-NEXT:    [[CONV20]] = fptrunc double [[MUL19]] to float
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[CONV20_LCSSA:%.*]] = phi float [ [[CONV20]], [[L3]] ]
+; CHECK-NEXT:    ret float [[CONV20_LCSSA]]
+;
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %sum.033 = phi float [ %conv20, %L3 ], [ 2.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 3, label %L3
+  i32 2, label %L2
+  ]
+
+L1:
+  %conv = sitofp i32 %0 to float
+  %conv4 = fpext float %conv to double
+  %add = fadd double %conv4, 1.000000e+00
+  %conv5 = fpext float %sum.033 to double
+  %mul = fmul double %add, %conv5
+  %conv6 = fptrunc double %mul to float
+  br label %L2
+
+L2:
+  %sum.1 = phi float [ %conv6, %L1 ], [ %sum.033, %for.body ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx7
+  %conv8 = sitofp i32 %1 to float
+  %conv9 = fpext float %conv8 to double
+  %add10 = fadd double %conv9, 2.000000e+00
+  %conv11 = fpext float %sum.1 to double
+  %mul12 = fmul double %add10, %conv11
+  %conv13 = fptrunc double %mul12 to float
+  br label %L3
+
+L3:
+  %sum.2 = phi float [ %conv13, %L2 ], [ %sum.033, %for.body ]
+  %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 %i
+  %2 = load i32, i32* %arrayidx14
+  %conv15 = sitofp i32 %2 to float
+  %conv16 = fpext float %conv15 to double
+  %add17 = fadd double %conv16, 3.000000e+00
+  %conv18 = fpext float %sum.2 to double
+  %mul19 = fmul double %add17, %conv18
+  %conv20 = fptrunc double %mul19 to float
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %conv20
+}
+
+!0 = distinct !{!0, !2, !4, !6}
+!1 = distinct !{!1, !3, !5, !6}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.width", i32 1}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 2}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-remove-switches.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-remove-switches.ll
@@ -0,0 +1,413 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve -scalable-vectorization=on -S | FileCheck %s
+
+define void @switch(i32* noalias %a, i32* noalias %b, i32* noalias %c, i64 %N) #0 {
+; CHECK-LABEL: @switch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP9]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP10]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[TMP17]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP19]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP21]], i32 4, <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP22:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 false, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[TMP9]]
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD7:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP22]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD7]]
+; CHECK-NEXT:    [[PREDPHI8:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nsw <vscale x 4 x i32> [[PREDPHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw <vscale x 4 x i32> [[TMP24]], [[PREDPHI8]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or <vscale x 4 x i1> [[TMP22]], [[TMP15]]
+; CHECK-NEXT:    [[PREDPHI9:%.*]] = select <vscale x 4 x i1> [[TMP26]], <vscale x 4 x i32> [[TMP25]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP27:%.*]] = or <vscale x 4 x i1> [[TMP8]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i32* [[TMP19]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP28]], i32 4, <vscale x 4 x i1> [[TMP27]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP29:%.*]] = mul nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD10]], [[PREDPHI9]]
+; CHECK-NEXT:    [[TMP30:%.*]] = add nsw <vscale x 4 x i32> [[TMP29]], [[PREDPHI9]]
+; CHECK-NEXT:    [[PREDPHI11:%.*]] = select <vscale x 4 x i1> [[TMP27]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD10]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD6]]
+; CHECK-NEXT:    [[PREDPHI12:%.*]] = select <vscale x 4 x i1> [[TMP27]], <vscale x 4 x i32> [[TMP30]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 4, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP31:%.*]] = mul nsw <vscale x 4 x i32> [[PREDPHI11]], [[PREDPHI11]]
+; CHECK-NEXT:    [[TMP32:%.*]] = add nsw <vscale x 4 x i32> [[TMP31]], [[PREDPHI12]]
+; CHECK-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP32]], <vscale x 4 x i32>* [[TMP33]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[I_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L4:%.*]] ], [ [[I_PH]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP35]], label [[L1:%.*]] [
+; CHECK-NEXT:    i32 3, label [[L3:%.*]]
+; CHECK-NEXT:    i32 2, label [[FOR_BODY_L2_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    i32 4, label [[FOR_BODY_L4_CRIT_EDGE:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       for.body.L4_crit_edge:
+; CHECK-NEXT:    [[ARRAYIDX17_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
+; CHECK-NEXT:    [[DOTPRE1:%.*]] = load i32, i32* [[ARRAYIDX17_PHI_TRANS_INSERT]], align 4
+; CHECK-NEXT:    br label [[L4]]
+; CHECK:       for.body.L2_crit_edge:
+; CHECK-NEXT:    [[ARRAYIDX7_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, i32* [[ARRAYIDX7_PHI_TRANS_INSERT]], align 4
+; CHECK-NEXT:    br label [[L2:%.*]]
+; CHECK:       L1:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP36:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP36]], [[TMP35]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP35]]
+; CHECK-NEXT:    br label [[L2]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[TMP37:%.*]] = phi i32 [ [[DOTPRE]], [[FOR_BODY_L2_CRIT_EDGE]] ], [ [[TMP36]], [[L1]] ]
+; CHECK-NEXT:    [[TMP38:%.*]] = phi i32 [ 2, [[FOR_BODY_L2_CRIT_EDGE]] ], [ [[ADD]], [[L1]] ]
+; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP37]], [[TMP37]]
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[MUL9]], [[TMP38]]
+; CHECK-NEXT:    br label [[L3]]
+; CHECK:       L3:
+; CHECK-NEXT:    [[TMP39:%.*]] = phi i32 [ [[TMP35]], [[FOR_BODY]] ], [ [[ADD11]], [[L2]] ]
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[MUL14:%.*]] = mul nsw i32 [[TMP40]], [[TMP39]]
+; CHECK-NEXT:    [[ADD16:%.*]] = add nsw i32 [[MUL14]], [[TMP39]]
+; CHECK-NEXT:    br label [[L4]]
+; CHECK:       L4:
+; CHECK-NEXT:    [[TMP41:%.*]] = phi i32 [ [[DOTPRE1]], [[FOR_BODY_L4_CRIT_EDGE]] ], [ [[TMP40]], [[L3]] ]
+; CHECK-NEXT:    [[TMP42:%.*]] = phi i32 [ 4, [[FOR_BODY_L4_CRIT_EDGE]] ], [ [[ADD16]], [[L3]] ]
+; CHECK-NEXT:    [[MUL19:%.*]] = mul nsw i32 [[TMP41]], [[TMP41]]
+; CHECK-NEXT:    [[ADD21:%.*]] = add nsw i32 [[MUL19]], [[TMP42]]
+; CHECK-NEXT:    store i32 [[ADD21]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L4 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 4, label %L4
+  i32 2, label %L2
+  i32 3, label %L3
+  ]
+
+L1:
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx5
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %0
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %2 = phi i32 [ 2, %for.body ], [ %add, %L1 ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %3 = load i32, i32* %arrayidx7
+  %mul9 = mul nsw i32 %3, %3
+  %add11 = add nsw i32 %2, %mul9
+  store i32 %add11, i32* %arrayidx
+  br label %L3
+
+L3:
+  %4 = phi i32 [ 3, %for.body ], [ %add11, %L2 ]
+  %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 %i
+  %5 = load i32, i32* %arrayidx13
+  %mul14 = mul nsw i32 %5, %4
+  %add16 = add nsw i32 %mul14, %4
+  store i32 %add16, i32* %arrayidx
+  br label %L4
+
+L4:
+  %6 = phi i32 [ 4, %for.body ], [ %add16, %L3 ]
+  %arrayidx17 = getelementptr inbounds i32, i32* %c, i64 %i
+  %7 = load i32, i32* %arrayidx17
+  %mul19 = mul nsw i32 %7, %7
+  %add21 = add nsw i32 %6, %mul19
+  store i32 %add21, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @switch_VF1_UF2(i32* noalias %a, i32* noalias %b, i32* noalias %c, i64 %N) #0 {
+; CHECK-LABEL: @switch_VF1_UF2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; CHECK-NEXT:    [[INDUCTION3:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i32> [[TMP2]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <2 x i32> [[TMP2]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <2 x i32> [[TMP2]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> <i32 2, i32 2>, <2 x i32> [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_CONTINUE:%.*]], label [[PRED_LOAD_IF:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_CONTINUE6]], label [[PRED_LOAD_IF5:%.*]]
+; CHECK:       pred.load.if5:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK:       pred.load.continue6:
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <2 x i32> [[TMP16]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <2 x i32> [[TMP17]], [[TMP6]]
+; CHECK-NEXT:    [[TMP19:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> <i32 3, i32 3>, <2 x i32> [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP22:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = shl nsw <2 x i32> [[TMP22]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP24:%.*]] = add nsw <2 x i32> [[TMP23]], [[TMP19]]
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP24]], <2 x i32>* [[TMP25]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[I_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ [[I_PH]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP27]], label [[FOR_BODY_SWITCH2:%.*]] [
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    ]
+; CHECK:       for.body.switch2:
+; CHECK-NEXT:    [[ADD:%.*]] = mul nsw i32 [[TMP27]], 3
+; CHECK-NEXT:    br label [[L2]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[TMP28:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY_SWITCH2]] ], [ [[TMP27]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[TMP29]], 3
+; CHECK-NEXT:    [[ADD8:%.*]] = add nsw i32 [[MUL6]], [[TMP28]]
+; CHECK-NEXT:    br label [[L3]]
+; CHECK:       L3:
+; CHECK-NEXT:    [[TMP30:%.*]] = phi i32 [ [[ADD8]], [[L2]] ], [ [[TMP27]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[MUL10:%.*]] = shl nsw i32 [[TMP31]], 2
+; CHECK-NEXT:    [[ADD12:%.*]] = add nsw i32 [[MUL10]], [[TMP30]]
+; CHECK-NEXT:    store i32 [[ADD12]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  %switch = icmp eq i32 %0, 3
+  br i1 %switch, label %L3, label %for.body.switch
+
+for.body.switch:
+  %switch1 = icmp eq i32 %0, 2
+  br i1 %switch1, label %L2, label %for.body.switch2
+
+for.body.switch2:
+  %add = mul nsw i32 %0, 3
+  store i32 %add, i32* %arrayidx
+  br label %L2
+
+L2:
+  %1 = phi i32 [ %add, %for.body.switch2 ], [ %0, %for.body.switch ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+  %2 = load i32, i32* %arrayidx5
+  %mul6 = mul nsw i32 %2, 3
+  %add8 = add nsw i32 %1, %mul6
+  store i32 %add8, i32* %arrayidx
+  br label %L3
+
+L3:
+  %3 = phi i32 [ %0, %for.body ], [ %add8, %L2 ]
+  %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 %i
+  %4 = load i32, i32* %arrayidx9
+  %mul10 = shl nsw i32 %4, 2
+  %add12 = add nsw i32 %3, %mul10
+  store i32 %add12, i32* %arrayidx
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+; This loop will not vectorize due to unsafe FP ops. Ensure the switch statement is created again in for.body.
+define float @switch_no_vectorize(i32* noalias %a, i32* noalias %b, i32* noalias %c, float %val, i64 %N) {
+; CHECK-LABEL: @switch_no_vectorize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[INC:%.*]], [[L3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SUM_033:%.*]] = phi float [ [[CONV20:%.*]], [[L3]] ], [ 2.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    switch i32 [[TMP0]], label [[L1:%.*]] [
+; CHECK-NEXT:    i32 2, label [[L2:%.*]]
+; CHECK-NEXT:    i32 3, label [[L3]]
+; CHECK-NEXT:    ]
+; CHECK:       L1:
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+; CHECK-NEXT:    [[CONV4:%.*]] = fpext float [[CONV]] to double
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[CONV4]], 1.000000e+00
+; CHECK-NEXT:    [[CONV5:%.*]] = fpext float [[SUM_033]] to double
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[ADD]], [[CONV5]]
+; CHECK-NEXT:    [[CONV6:%.*]] = fptrunc double [[MUL]] to float
+; CHECK-NEXT:    br label [[L2]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[SUM_1:%.*]] = phi float [ [[CONV6]], [[L1]] ], [ [[SUM_033]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[CONV8:%.*]] = sitofp i32 [[TMP1]] to float
+; CHECK-NEXT:    [[CONV9:%.*]] = fpext float [[CONV8]] to double
+; CHECK-NEXT:    [[ADD10:%.*]] = fadd double [[CONV9]], 2.000000e+00
+; CHECK-NEXT:    [[CONV11:%.*]] = fpext float [[SUM_1]] to double
+; CHECK-NEXT:    [[MUL12:%.*]] = fmul double [[ADD10]], [[CONV11]]
+; CHECK-NEXT:    [[CONV13:%.*]] = fptrunc double [[MUL12]] to float
+; CHECK-NEXT:    br label [[L3]]
+; CHECK:       L3:
+; CHECK-NEXT:    [[SUM_2:%.*]] = phi float [ [[CONV13]], [[L2]] ], [ [[SUM_033]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[CONV15:%.*]] = sitofp i32 [[TMP2]] to float
+; CHECK-NEXT:    [[CONV16:%.*]] = fpext float [[CONV15]] to double
+; CHECK-NEXT:    [[ADD17:%.*]] = fadd double [[CONV16]], 3.000000e+00
+; CHECK-NEXT:    [[CONV18:%.*]] = fpext float [[SUM_2]] to double
+; CHECK-NEXT:    [[MUL19:%.*]] = fmul double [[ADD17]], [[CONV18]]
+; CHECK-NEXT:    [[CONV20]] = fptrunc double [[MUL19]] to float
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[CONV20_LCSSA:%.*]] = phi float [ [[CONV20]], [[L3]] ]
+; CHECK-NEXT:    ret float [[CONV20_LCSSA]]
+;
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %inc, %L3 ], [ 0, %entry ]
+  %sum.033 = phi float [ %conv20, %L3 ], [ 2.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
+  %0 = load i32, i32* %arrayidx
+  switch i32 %0, label %L1 [
+  i32 3, label %L3
+  i32 2, label %L2
+  ]
+
+L1:
+  %conv = sitofp i32 %0 to float
+  %conv4 = fpext float %conv to double
+  %add = fadd double %conv4, 1.000000e+00
+  %conv5 = fpext float %sum.033 to double
+  %mul = fmul double %add, %conv5
+  %conv6 = fptrunc double %mul to float
+  br label %L2
+
+L2:
+  %sum.1 = phi float [ %conv6, %L1 ], [ %sum.033, %for.body ]
+  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %i
+  %1 = load i32, i32* %arrayidx7
+  %conv8 = sitofp i32 %1 to float
+  %conv9 = fpext float %conv8 to double
+  %add10 = fadd double %conv9, 2.000000e+00
+  %conv11 = fpext float %sum.1 to double
+  %mul12 = fmul double %add10, %conv11
+  %conv13 = fptrunc double %mul12 to float
+  br label %L3
+
+L3:
+  %sum.2 = phi float [ %conv13, %L2 ], [ %sum.033, %for.body ]
+  %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 %i
+  %2 = load i32, i32* %arrayidx14
+  %conv15 = sitofp i32 %2 to float
+  %conv16 = fpext float %conv15 to double
+  %add17 = fadd double %conv16, 3.000000e+00
+  %conv18 = fpext float %sum.2 to double
+  %mul19 = fmul double %add17, %conv18
+  %conv20 = fptrunc double %mul19 to float
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %conv20
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 1}
+!2 = !{!"llvm.loop.interleave.count", i32 2}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -180,6 +180,8 @@
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-O-NEXT: Running pass: LowerSwitchPass
+; CHECK-O-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
Index: llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -168,6 +168,8 @@
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-O-NEXT: Running pass: LowerSwitchPass
+; CHECK-O-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
Index: llvm/test/Other/new-pm-thinlto-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-thinlto-defaults.ll
+++ llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -197,6 +197,8 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-POSTLINK-O-NEXT: Running pass: LowerSwitchPass
+; CHECK-POSTLINK-O-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
Index: llvm/test/Other/new-pm-lto-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-lto-defaults.ll
+++ llvm/test/Other/new-pm-lto-defaults.ll
@@ -105,6 +105,7 @@
 ; CHECK-O23SZ-NEXT: Running pass: LoopDeletionPass on Loop
 ; CHECK-O23SZ-NEXT: Running pass: LoopFullUnrollPass on Loop
 ; CHECK-O23SZ-NEXT: Running pass: LoopDistributePass on foo
+; CHECK-O23SZ-NEXT: Running pass: LowerSwitchPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: BlockFrequencyAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running analysis: BranchProbabilityAnalysis on foo
Index: llvm/test/Other/new-pm-defaults.ll
===================================================================
--- llvm/test/Other/new-pm-defaults.ll
+++ llvm/test/Other/new-pm-defaults.ll
@@ -216,6 +216,8 @@
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
+; CHECK-O-NEXT: Running pass: LowerSwitchPass
+; CHECK-O-NEXT: Running analysis: LazyValueAnalysis
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -56,6 +56,8 @@
 ; GCN-O0-NEXT:      FunctionPass Manager
 ; GCN-O0-NEXT:        AMDGPU Lower Kernel Arguments
 ; GCN-O0-NEXT:        Lazy Value Information Analysis
+; GCN-O0-NEXT:        Dominator Tree Construction
+; GCN-O0-NEXT:        Natural Loop Information
 ; GCN-O0-NEXT:        Lower SwitchInst's to branches
 ; GCN-O0-NEXT:        Lower invoke and unwind, for unwindless code generators
 ; GCN-O0-NEXT:        Remove unreachable blocks from the CFG
@@ -65,16 +67,22 @@
 ; GCN-O0-NEXT:        Legacy Divergence Analysis
 ; GCN-O0-NEXT:        Unify divergent function exit nodes
 ; GCN-O0-NEXT:        Lazy Value Information Analysis
+; GCN-O0-NEXT:        Dominator Tree Construction
+; GCN-O0-NEXT:        Natural Loop Information
 ; GCN-O0-NEXT:        Lower SwitchInst's to branches
 ; GCN-O0-NEXT:        Dominator Tree Construction
 ; GCN-O0-NEXT:        Natural Loop Information
 ; GCN-O0-NEXT:        Convert irreducible control-flow into natural loops
+; GCN-O0-NEXT:        Dominator Tree Construction
+; GCN-O0-NEXT:        Natural Loop Information
 ; GCN-O0-NEXT:        Fixup each natural loop to have a single exit block
+; GCN-O0-NEXT:        Dominator Tree Construction
 ; GCN-O0-NEXT:        Post-Dominator Tree Construction
 ; GCN-O0-NEXT:        Dominance Frontier Construction
 ; GCN-O0-NEXT:        Detect single entry single exit regions
 ; GCN-O0-NEXT:        Region Pass Manager
 ; GCN-O0-NEXT:          Structurize control flow
+; GCN-O0-NEXT:        Dominator Tree Construction
 ; GCN-O0-NEXT:        Post-Dominator Tree Construction
 ; GCN-O0-NEXT:        Natural Loop Information
 ; GCN-O0-NEXT:        Legacy Divergence Analysis
@@ -223,6 +231,8 @@
 ; GCN-O1-NEXT:        Natural Loop Information
 ; GCN-O1-NEXT:        CodeGen Prepare
 ; GCN-O1-NEXT:        Lazy Value Information Analysis
+; GCN-O1-NEXT:        Dominator Tree Construction
+; GCN-O1-NEXT:        Natural Loop Information
 ; GCN-O1-NEXT:        Lower SwitchInst's to branches
 ; GCN-O1-NEXT:        Lower invoke and unwind, for unwindless code generators
 ; GCN-O1-NEXT:        Remove unreachable blocks from the CFG
@@ -241,16 +251,22 @@
 ; GCN-O1-NEXT:        Legacy Divergence Analysis
 ; GCN-O1-NEXT:        Unify divergent function exit nodes
 ; GCN-O1-NEXT:        Lazy Value Information Analysis
+; GCN-O1-NEXT:        Dominator Tree Construction
+; GCN-O1-NEXT:        Natural Loop Information
 ; GCN-O1-NEXT:        Lower SwitchInst's to branches
 ; GCN-O1-NEXT:        Dominator Tree Construction
 ; GCN-O1-NEXT:        Natural Loop Information
 ; GCN-O1-NEXT:        Convert irreducible control-flow into natural loops
+; GCN-O1-NEXT:        Dominator Tree Construction
+; GCN-O1-NEXT:        Natural Loop Information
 ; GCN-O1-NEXT:        Fixup each natural loop to have a single exit block
+; GCN-O1-NEXT:        Dominator Tree Construction
 ; GCN-O1-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-NEXT:        Dominance Frontier Construction
 ; GCN-O1-NEXT:        Detect single entry single exit regions
 ; GCN-O1-NEXT:        Region Pass Manager
 ; GCN-O1-NEXT:          Structurize control flow
+; GCN-O1-NEXT:        Dominator Tree Construction
 ; GCN-O1-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-NEXT:        Natural Loop Information
 ; GCN-O1-NEXT:        Legacy Divergence Analysis
@@ -517,16 +533,22 @@
 ; GCN-O1-OPTS-NEXT:        Legacy Divergence Analysis
 ; GCN-O1-OPTS-NEXT:        Unify divergent function exit nodes
 ; GCN-O1-OPTS-NEXT:        Lazy Value Information Analysis
+; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:        Natural Loop Information
 ; GCN-O1-OPTS-NEXT:        Lower SwitchInst's to branches
 ; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Natural Loop Information
 ; GCN-O1-OPTS-NEXT:        Convert irreducible control-flow into natural loops
+; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:        Natural Loop Information
 ; GCN-O1-OPTS-NEXT:        Fixup each natural loop to have a single exit block
+; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Dominance Frontier Construction
 ; GCN-O1-OPTS-NEXT:        Detect single entry single exit regions
 ; GCN-O1-OPTS-NEXT:        Region Pass Manager
 ; GCN-O1-OPTS-NEXT:          Structurize control flow
+; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Post-Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Natural Loop Information
 ; GCN-O1-OPTS-NEXT:        Legacy Divergence Analysis
@@ -801,16 +823,22 @@
 ; GCN-O2-NEXT:        Legacy Divergence Analysis
 ; GCN-O2-NEXT:        Unify divergent function exit nodes
 ; GCN-O2-NEXT:        Lazy Value Information Analysis
+; GCN-O2-NEXT:        Dominator Tree Construction
+; GCN-O2-NEXT:        Natural Loop Information
 ; GCN-O2-NEXT:        Lower SwitchInst's to branches
 ; GCN-O2-NEXT:        Dominator Tree Construction
 ; GCN-O2-NEXT:        Natural Loop Information
 ; GCN-O2-NEXT:        Convert irreducible control-flow into natural loops
+; GCN-O2-NEXT:        Dominator Tree Construction
+; GCN-O2-NEXT:        Natural Loop Information
 ; GCN-O2-NEXT:        Fixup each natural loop to have a single exit block
+; GCN-O2-NEXT:        Dominator Tree Construction
 ; GCN-O2-NEXT:        Post-Dominator Tree Construction
 ; GCN-O2-NEXT:        Dominance Frontier Construction
 ; GCN-O2-NEXT:        Detect single entry single exit regions
 ; GCN-O2-NEXT:        Region Pass Manager
 ; GCN-O2-NEXT:          Structurize control flow
+; GCN-O2-NEXT:        Dominator Tree Construction
 ; GCN-O2-NEXT:        Post-Dominator Tree Construction
 ; GCN-O2-NEXT:        Natural Loop Information
 ; GCN-O2-NEXT:        Legacy Divergence Analysis
@@ -1100,16 +1128,22 @@
 ; GCN-O3-NEXT:        Legacy Divergence Analysis
 ; GCN-O3-NEXT:        Unify divergent function exit nodes
 ; GCN-O3-NEXT:        Lazy Value Information Analysis
+; GCN-O3-NEXT:        Dominator Tree Construction
+; GCN-O3-NEXT:        Natural Loop Information
 ; GCN-O3-NEXT:        Lower SwitchInst's to branches
 ; GCN-O3-NEXT:        Dominator Tree Construction
 ; GCN-O3-NEXT:        Natural Loop Information
 ; GCN-O3-NEXT:        Convert irreducible control-flow into natural loops
+; GCN-O3-NEXT:        Dominator Tree Construction
+; GCN-O3-NEXT:        Natural Loop Information
 ; GCN-O3-NEXT:        Fixup each natural loop to have a single exit block
+; GCN-O3-NEXT:        Dominator Tree Construction
 ; GCN-O3-NEXT:        Post-Dominator Tree Construction
 ; GCN-O3-NEXT:        Dominance Frontier Construction
 ; GCN-O3-NEXT:        Detect single entry single exit regions
 ; GCN-O3-NEXT:        Region Pass Manager
 ; GCN-O3-NEXT:          Structurize control flow
+; GCN-O3-NEXT:        Dominator Tree Construction
 ; GCN-O3-NEXT:        Post-Dominator Tree Construction
 ; GCN-O3-NEXT:        Natural Loop Information
 ; GCN-O3-NEXT:        Legacy Divergence Analysis
Index: llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
===================================================================
--- llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -40,8 +40,6 @@
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreservedID(LowerSwitchID);
-    AU.addPreserved<LoopInfoWrapperPass>();
-    AU.addPreserved<DominatorTreeWrapperPass>();
   }
 
   bool runOnFunction(Function &F) override;
Index: llvm/lib/Transforms/Utils/LowerSwitch.cpp
===================================================================
--- llvm/lib/Transforms/Utils/LowerSwitch.cpp
+++ llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
@@ -32,6 +33,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/KnownBits.h"
@@ -49,6 +51,10 @@
 
 #define DEBUG_TYPE "lower-switch"
 
+static cl::opt<bool>
+    ForceLoopUnswitch("force-loop-unswitch", cl::Hidden, cl::init(false),
+                      cl::desc("Unswitch simple switches in loops"));
+
 namespace {
 
   struct IntRange {
@@ -106,6 +112,45 @@
   return O << "]";
 }
 
+namespace {
+/// Groups the switch-lowering helpers with the LoopInfo and mode flag they use.
+class LowerSwitch {
+private:
+  LoopInfo *LI;
+  bool LoopUnswitch;
+public:
+  /// \p LoopUnswitch is only copied, so it is taken by value.
+  LowerSwitch(LoopInfo *LI, bool LoopUnswitch)
+      : LI(LI), LoopUnswitch(LoopUnswitch) {}
+  // NOTE(review): no definition of run() appears in this patch - confirm.
+  bool run();
+  /// Update the PHI nodes of \p SuccBB that name \p OrigBB to name \p NewBB.
+  void FixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,
+               const unsigned NumMergedCases);
+  /// Create the leaf block for \p Leaf; non-matching values go to \p Default.
+  BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound,
+                           ConstantInt *UpperBound, BasicBlock *OrigBlock,
+                           BasicBlock *Default);
+  /// Chain one NewLeafBlock per case of \p SI, ending at \p DefaultBlock.
+  BasicBlock *SimpleSwitchConvert(SwitchInst *SI, BasicBlock *OrigBlock,
+                                  BasicBlock *DefaultBlock);
+  /// Recursively build a binary lookup of the case values.
+  BasicBlock *SwitchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
+                            ConstantInt *UpperBound, Value *Val,
+                            BasicBlock *Predecessor, BasicBlock *OrigBlock,
+                            BasicBlock *Default,
+                            const std::vector<IntRange> &UnreachableRanges);
+  /// Fill \p Cases from \p SI; \returns the number of non-default cases.
+  unsigned Clusterify(CaseVector &Cases, SwitchInst *SI);
+  /// Replace \p SI with a sequence of chained if-then instructions.
+  void ProcessSwitchInst(SwitchInst *SI,
+                         SmallPtrSetImpl<BasicBlock *> &DeleteList,
+                         AssumptionCache *AC, LazyValueInfo *LVI);
+  /// Entry point for both pass wrappers; the result is their "changed" flag.
+  bool LowerSwitches(Function &F, LazyValueInfo *LVI, AssumptionCache *AC);
+};
+} // namespace
+
 /// Update the first occurrence of the "switch statement" BB in the PHI
 /// node with the "new" BB. The other occurrences will:
 ///
@@ -116,7 +161,7 @@
 /// 2) Removed if subsequent incoming values now share the same case, i.e.,
 /// multiple outcome edges are condensed into one. This is necessary to keep the
 /// number of phi values equal to the number of branches to SuccBB.
-void FixPhis(
+void LowerSwitch::FixPhis(
     BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,
     const unsigned NumMergedCases = std::numeric_limits<unsigned>::max()) {
   for (BasicBlock::iterator I = SuccBB->begin(),
@@ -153,7 +198,7 @@
 /// switch's value == the case's value. If not, then it jumps to the default
 /// branch. At this point in the tree, the value can't be another valid case
 /// value, so the jump to the "default" branch is warranted.
-BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound,
+BasicBlock *LowerSwitch::NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound,
                          ConstantInt *UpperBound, BasicBlock *OrigBlock,
                          BasicBlock *Default) {
   Function *F = OrigBlock->getParent();
@@ -213,16 +258,32 @@
   return NewLeaf;
 }
 
+/// Lower \p SI as a linear chain with one NewLeafBlock per case, falling
+/// through to \p DefaultBlock. \returns the last leaf created (chain head).
+BasicBlock *LowerSwitch::SimpleSwitchConvert(SwitchInst *SI,
+                                             BasicBlock *OrigBlock,
+                                             BasicBlock *DefaultBlock) {
+  Value *Cond = SI->getCondition();
+  BasicBlock *Head = DefaultBlock;
+  for (auto CaseHandle : SI->cases()) {
+    auto *CaseVal = CaseHandle.getCaseValue();
+    CaseRange Leaf(CaseVal, CaseVal, CaseHandle.getCaseSuccessor());
+    Head = NewLeafBlock(Leaf, Cond, Leaf.Low, Leaf.High, OrigBlock, Head);
+  }
+  return Head;
+}
+
 /// Convert the switch statement into a binary lookup of the case values.
 /// The function recursively builds this tree. LowerBound and UpperBound are
 /// used to keep track of the bounds for Val that have already been checked by
 /// a block emitted by one of the previous calls to switchConvert in the call
 /// stack.
-BasicBlock *SwitchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
-                          ConstantInt *UpperBound, Value *Val,
-                          BasicBlock *Predecessor, BasicBlock *OrigBlock,
-                          BasicBlock *Default,
-                          const std::vector<IntRange> &UnreachableRanges) {
+BasicBlock *
+LowerSwitch::SwitchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
+                           ConstantInt *UpperBound, Value *Val,
+                           BasicBlock *Predecessor, BasicBlock *OrigBlock,
+                           BasicBlock *Default,
+                           const std::vector<IntRange> &UnreachableRanges) {
   assert(LowerBound && UpperBound && "Bounds must be initialized");
   unsigned Size = End - Begin;
 
@@ -301,7 +362,7 @@
 /// Transform simple list of \p SI's cases into list of CaseRange's \p Cases.
 /// \post \p Cases wouldn't contain references to \p SI's default BB.
 /// \returns Number of \p SI's cases that do not reference \p SI's default BB.
-unsigned Clusterify(CaseVector &Cases, SwitchInst *SI) {
+unsigned LowerSwitch::Clusterify(CaseVector &Cases, SwitchInst *SI) {
   unsigned NumSimpleCases = 0;
 
   // Start with "simple" cases
@@ -342,9 +403,9 @@
 
 /// Replace the specified switch instruction with a sequence of chained if-then
 /// insts in a balanced binary search.
-void ProcessSwitchInst(SwitchInst *SI,
-                       SmallPtrSetImpl<BasicBlock *> &DeleteList,
-                       AssumptionCache *AC, LazyValueInfo *LVI) {
+void LowerSwitch::ProcessSwitchInst(SwitchInst *SI,
+                                    SmallPtrSetImpl<BasicBlock *> &DeleteList,
+                                    AssumptionCache *AC, LazyValueInfo *LVI) {
   BasicBlock *OrigBlock = SI->getParent();
   Function *F = OrigBlock->getParent();
   Value *Val = SI->getCondition();  // The value we are switching on...
@@ -374,6 +435,17 @@
     return;
   }
 
+  bool SimpleSwitch = true;
+  for (auto Case : SI->cases())
+    if (!SI->findCaseDest(Case.getCaseSuccessor()))
+      SimpleSwitch = false;
+
+  // If we're running this pass before loop vectorisation, we should only
+  // attempt to convert simple switches which are in a loop.
+  if ((LoopUnswitch || ForceLoopUnswitch) &&
+      (!SimpleSwitch || !LI->getLoopFor(OrigBlock)))
+    return;
+
   ConstantInt *LowerBound = nullptr;
   ConstantInt *UpperBound = nullptr;
   bool DefaultIsUnreachableFromSwitch = false;
@@ -418,7 +490,7 @@
 
   std::vector<IntRange> UnreachableRanges;
 
-  if (DefaultIsUnreachableFromSwitch) {
+  if (DefaultIsUnreachableFromSwitch && !(LoopUnswitch || ForceLoopUnswitch)) {
     DenseMap<BasicBlock *, unsigned> Popularity;
     unsigned MaxPop = 0;
     BasicBlock *PopSucc = nullptr;
@@ -500,9 +572,13 @@
   F->getBasicBlockList().insert(Default->getIterator(), NewDefault);
   BranchInst::Create(Default, NewDefault);
 
-  BasicBlock *SwitchBlock =
-      SwitchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,
-                    OrigBlock, OrigBlock, NewDefault, UnreachableRanges);
+  BasicBlock *SwitchBlock;
+  if ((LoopUnswitch || ForceLoopUnswitch) && SimpleSwitch)
+    SwitchBlock = SimpleSwitchConvert(SI, OrigBlock, NewDefault);
+  else
+    SwitchBlock =
+        SwitchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,
+                      OrigBlock, OrigBlock, NewDefault, UnreachableRanges);
 
   // If there are entries in any PHI nodes for the default edge, make sure
   // to update them as well.
@@ -520,7 +596,8 @@
     DeleteList.insert(OldDefault);
 }
 
-bool LowerSwitch(Function &F, LazyValueInfo *LVI, AssumptionCache *AC) {
+bool LowerSwitch::LowerSwitches(Function &F, LazyValueInfo *LVI,
+                                AssumptionCache *AC) {
   bool Changed = false;
   SmallPtrSet<BasicBlock *, 8> DeleteList;
 
@@ -552,8 +629,10 @@
 public:
   // Pass identification, replacement for typeid
   static char ID;
+  bool LoopUnswitch;
 
-  LowerSwitchLegacyPass() : FunctionPass(ID) {
+  LowerSwitchLegacyPass(bool LoopUnswitch = false)
+      : FunctionPass(ID), LoopUnswitch(LoopUnswitch) {
     initializeLowerSwitchLegacyPassPass(*PassRegistry::getPassRegistry());
   }
 
@@ -561,6 +640,8 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LazyValueInfoWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
   }
 };
 
@@ -575,6 +656,8 @@
                       "Lower SwitchInst's to branches", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(LowerSwitchLegacyPass, "lowerswitch",
                     "Lower SwitchInst's to branches", false, false)
 
@@ -585,15 +668,19 @@
 
 bool LowerSwitchLegacyPass::runOnFunction(Function &F) {
   LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>();
   AssumptionCache *AC = ACT ? &ACT->getAssumptionCache(F) : nullptr;
-  return LowerSwitch(F, LVI, AC);
+  LowerSwitch LS = LowerSwitch(LI, LoopUnswitch);
+  return LS.LowerSwitches(F, LVI, AC);
 }
 
 PreservedAnalyses LowerSwitchPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {
   LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
+  LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
   AssumptionCache *AC = AM.getCachedResult<AssumptionAnalysis>(F);
-  return LowerSwitch(F, LVI, AC) ? PreservedAnalyses::none()
-                                 : PreservedAnalyses::all();
+  LowerSwitch LS = LowerSwitch(LI, LoopUnswitch);
+  return LS.LowerSwitches(F, LVI, AC) ? PreservedAnalyses::none()
+                                      : PreservedAnalyses::all();
 }
Index: llvm/lib/Transforms/Utils/FixIrreducible.cpp
===================================================================
--- llvm/lib/Transforms/Utils/FixIrreducible.cpp
+++ llvm/lib/Transforms/Utils/FixIrreducible.cpp
@@ -90,8 +90,6 @@
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addPreservedID(LowerSwitchID);
-    AU.addPreserved<DominatorTreeWrapperPass>();
-    AU.addPreserved<LoopInfoWrapperPass>();
   }
 
   bool runOnFunction(Function &F) override;
Index: llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -348,7 +348,6 @@
     AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTreeWrapperPass>();
 
-    AU.addPreserved<DominatorTreeWrapperPass>();
     RegionPass::getAnalysisUsage(AU);
   }
 };
Index: llvm/lib/Passes/PassBuilder.cpp
===================================================================
--- llvm/lib/Passes/PassBuilder.cpp
+++ llvm/lib/Passes/PassBuilder.cpp
@@ -1203,6 +1203,7 @@
 /// TODO: Should LTO cause any differences to this set of passes?
 void PassBuilder::addVectorPasses(OptimizationLevel Level,
                                   FunctionPassManager &FPM, bool IsFullLTO) {
+  FPM.addPass(LowerSwitchPass(true));
   FPM.addPass(LoopVectorizePass(
       LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
 
Index: llvm/include/llvm/Transforms/Utils/LowerSwitch.h
===================================================================
--- llvm/include/llvm/Transforms/Utils/LowerSwitch.h
+++ llvm/include/llvm/Transforms/Utils/LowerSwitch.h
@@ -18,7 +18,13 @@
 #include "llvm/IR/PassManager.h"
 
 namespace llvm {
+class LoopInfo;
+
 struct LowerSwitchPass : public PassInfoMixin<LowerSwitchPass> {
+  /// Forwarded to the lowering in run(); when set, only simple switches
+  /// inside loops are converted (the mode used ahead of vectorisation).
+  bool LoopUnswitch;
+  LowerSwitchPass(bool LoopUnswitch = false) : LoopUnswitch(LoopUnswitch) {}
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 } // namespace llvm
Index: clang/test/Frontend/optimization-remark-analysis.c
===================================================================
--- clang/test/Frontend/optimization-remark-analysis.c
+++ clang/test/Frontend/optimization-remark-analysis.c
@@ -1,8 +1,8 @@
 // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS
 // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -S %s -o - 2>&1 | FileCheck %s
 
-// RPASS: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement
-// CHECK-NOT: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement
+// RPASS: {{.*}}:7:8: remark: loop not vectorized: value that could not be identified as reduction is used outside the loop
+// CHECK-NOT: {{.*}}:7:8: remark: loop not vectorized: value that could not be identified as reduction is used outside the loop
 
 double foo(int N, int *Array) {
   double v = 0.0;

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D108138: [WIP] Remove switch statements before vectorization

Reply via email to