nhaehnle created this revision.
nhaehnle added a reviewer: arsenm.
Herald added subscribers: cfe-commits, kerbowa, jfb, hiraditya, t-tye, tpr, dstuttard, yaxunl, jvesely, kzhuravl.
Herald added projects: clang, LLVM.
nhaehnle requested review of this revision.
Herald added a subscriber: wdng.

The new overloaded lane intrinsics (llvm.amdgcn.readfirstlane2, llvm.amdgcn.readlane2,
and llvm.amdgcn.writelane2) should work with at least the standard integer and
floating-point sizes, pointers, and vectors of those types.
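
As a rough sketch of the intended surface (the overloads below are ones declared
in the updated tests; other combinations of these types are meant to work the
same way):

  declare i32 @llvm.amdgcn.readfirstlane2.i32(i32)
  declare float @llvm.amdgcn.readfirstlane2.f32(float)
  declare <2 x i16> @llvm.amdgcn.readfirstlane2.v2i16(<2 x i16>)
  declare i8 addrspace(3)* @llvm.amdgcn.readfirstlane2.p3i8(i8 addrspace(3)*)
  declare i32 @llvm.amdgcn.readlane2.i32(i32, i32)
  declare i32 @llvm.amdgcn.writelane2.i32(i32, i32, i32)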

This also fixes instruction selection for non-s32 types when a readfirstlane is
inserted for SGPR return values.
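
For illustration (this mirrors the irtranslator-amdgpu_ps.ll test below; the
function name here is made up), returning a live VGPR value from an amdgpu_ps
function inserts a readfirstlane so the result can be handed back in an SGPR:

  define amdgpu_ps i32 @vgpr_return(i32 %vgpr) {
    ; lowered through llvm.amdgcn.readfirstlane2 before the
    ; result is copied into $sgpr0
    ret i32 %vgpr
  }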

The atomic optimizer pass is moved earlier in the pass pipeline so that it can
be simplified to rely on the more general lane-intrinsic support.

API users should migrate to the new intrinsics so that the old versions can
eventually be removed.
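
Migration is mechanical; for example (values are illustrative, the pattern
matches the InstCombine test updates below):

  ; old
  %r = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
  ; new
  %r = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %src)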

Change-Id: I1c5e7e7858890e1c30d3b46c8551e74ab7027552
Based-on: https://reviews.llvm.org/D84639


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D86154

Files:
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGenOpenCL/builtins-amdgcn.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
  llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
  llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
  llvm/lib/Target/AMDGPU/SIISelLowering.cpp
  llvm/lib/Target/AMDGPU/SIInstructions.td
  llvm/lib/Target/AMDGPU/VOP1Instructions.td
  llvm/lib/Target/AMDGPU/VOP2Instructions.td
  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
  llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
  llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll
  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir
  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir
  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
  llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll

Index: llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
===================================================================
--- llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2507,8 +2507,8 @@
 
 define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT:    store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    store volatile i32 [[TMP1]], i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 123, i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2530,8 +2530,8 @@
 
 define i32 @readfirstlane_idempotent(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret i32 [[READ0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readfirstlane(i32 %read0)
@@ -2541,8 +2541,8 @@
 
 define i32 @readfirstlane_readlane(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret i32 [[READ0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2552,11 +2552,11 @@
 define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT:    ret i32 [[READ1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[TMP0]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2570,11 +2570,11 @@
 define i32 @readfirstlane_readlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 0)
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
-; CHECK-NEXT:    ret i32 [[READ1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[TMP0]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 0)
@@ -2585,6 +2585,41 @@
   ret i32 %read1
 }
 
+define i32 @readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane2.f32(float [[ARG:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
+define float @bitcast_readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @bitcast_readfirstlane_bitcast(
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane2.f32(float [[ARG:%.*]])
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  %cast.read = bitcast i32 %read to float
+  ret float %cast.read
+}
+
+define i32 @readfirstlane_bitcast_multi_use(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast_multi_use(
+; CHECK-NEXT:    store float [[ARG:%.*]], float* undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane2.f32(float [[ARG]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  store i32 %bitcast.arg, i32* undef
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.readlane
 ; --------------------------------------------------------------------
@@ -2593,8 +2628,8 @@
 
 define amdgpu_kernel void @readlane_constant(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_constant(
-; CHECK-NEXT:    [[VAR:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 7)
-; CHECK-NEXT:    store volatile i32 [[VAR]], i32* undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 7)
+; CHECK-NEXT:    store volatile i32 [[TMP1]], i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 123, i32* undef, align 4
 ; CHECK-NEXT:    store volatile i32 ptrtoint (i32* @gv to i32), i32* undef, align 4
@@ -2616,8 +2651,8 @@
 
 define i32 @readlane_idempotent(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
-; CHECK-NEXT:    ret i32 [[READ0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane)
@@ -2626,9 +2661,9 @@
 
 define i32 @readlane_idempotent_different_lanes(i32 %arg, i32 %lane0, i32 %lane1) {
 ; CHECK-LABEL: @readlane_idempotent_different_lanes(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE1:%.*]])
-; CHECK-NEXT:    ret i32 [[READ1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 [[LANE0:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[TMP1]], i32 [[LANE1:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane0)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane1)
@@ -2637,8 +2672,8 @@
 
 define i32 @readlane_readfirstlane(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
-; CHECK-NEXT:    ret i32 [[READ0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
   %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0)
@@ -2648,11 +2683,11 @@
 define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) {
 ; CHECK-LABEL: @readlane_idempotent_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[ARG:%.*]], i32 [[LANE:%.*]])
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]])
-; CHECK-NEXT:    ret i32 [[READ1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[TMP0]], i32 [[LANE]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane)
@@ -2667,11 +2702,11 @@
 define i32 @readlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane_different_block(
 ; CHECK-NEXT:  bb0:
-; CHECK-NEXT:    [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0)
-; CHECK-NEXT:    ret i32 [[READ1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.readlane2.i32(i32 [[TMP0]], i32 0)
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 bb0:
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -3,6 +3,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
 
 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.writelane2.p3i8(i8 addrspace(3)*, i32, i8 addrspace(3)*) #0
+declare i16 @llvm.amdgcn.writelane2.i16(i16, i32, i16) #0
+declare half @llvm.amdgcn.writelane2.f16(half, i32, half) #0
+declare <3 x i16> @llvm.amdgcn.writelane2.v3i16(<3 x i16>, i32, <3 x i16>) #0
+declare <9 x float> @llvm.amdgcn.writelane2.v9f32(<9 x float>, i32, <9 x float>) #0
 
 ; CHECK-LABEL: {{^}}test_writelane_sreg:
 ; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
@@ -79,6 +84,60 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_writelane_p3:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %writelane = call i8 addrspace(3)* @llvm.amdgcn.writelane2.p3i8(i8 addrspace(3)* null, i32 15, i8 addrspace(3)* %src)
+  store i8 addrspace(3)* %writelane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_i16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %writelane = call i16 @llvm.amdgcn.writelane2.i16(i16 1234, i32 15, i16 %src)
+  store i16 %writelane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_f16:
+; CHECK: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_f16(half addrspace(1)* %out, half %src) {
+  %writelane = call half @llvm.amdgcn.writelane2.f16(half 1.0, i32 15, half %src)
+  store half %writelane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v3i16:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}},
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %writelane = call <3 x i16> @llvm.amdgcn.writelane2.v3i16(<3 x i16> zeroinitializer, i32 15, <3 x i16> %src)
+  store <3 x i16> %writelane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_v9f32:
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-DAG: v_writelane_b32 v{{[0-9]+}}, 0, 15
+; CHECK-NOT: v_writelane_b32
+define void @test_writelane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %writelane = call <9 x float> @llvm.amdgcn.writelane2.v9f32(<9 x float> zeroinitializer, i32 15, <9 x float> %src)
+  store <9 x float> %writelane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,6 +1,11 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 
 declare i32 @llvm.amdgcn.readlane(i32, i32) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readlane2.p3i8(i8 addrspace(3)*, i32) #0
+declare i16 @llvm.amdgcn.readlane2.i16(i16, i32) #0
+declare half @llvm.amdgcn.readlane2.f16(half, i32) #0
+declare <3 x i16> @llvm.amdgcn.readlane2.v3i16(<3 x i16>, i32) #0
+declare <9 x float> @llvm.amdgcn.readlane2.v9f32(<9 x float>, i32) #0
 
 ; CHECK-LABEL: {{^}}test_readlane_sreg_sreg:
 ; CHECK-NOT: v_readlane_b32
@@ -77,6 +82,60 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readlane_p3:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %readlane = call i8 addrspace(3)* @llvm.amdgcn.readlane2.p3i8(i8 addrspace(3)* %src, i32 15)
+  store i8 addrspace(3)* %readlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_i16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %readlane = call i16 @llvm.amdgcn.readlane2.i16(i16 %src, i32 15)
+  store i16 %readlane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_f16:
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_f16(half addrspace(1)* %out, half %src) {
+  %readlane = call half @llvm.amdgcn.readlane2.f16(half %src, i32 15)
+  store half %readlane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v3i16:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}},
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %readlane = call <3 x i16> @llvm.amdgcn.readlane2.v3i16(<3 x i16> %src, i32 15)
+  store <3 x i16> %readlane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readlane_v9f32:
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v2, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v3, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v4, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v5, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v6, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v7, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v8, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v9, 15
+; CHECK-DAG: v_readlane_b32 s{{[0-9]+}}, v10, 15
+; CHECK-NOT: v_readlane_b32
+define void @test_readlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %readlane = call <9 x float> @llvm.amdgcn.readlane2.v9f32(<9 x float> %src, i32 15)
+  store <9 x float> %readlane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,11 +1,20 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 
-declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+declare i32 @llvm.amdgcn.readfirstlane2.i32(i32) #0
+declare float @llvm.amdgcn.readfirstlane2.f32(float) #0
+declare <2 x half> @llvm.amdgcn.readfirstlane2.v2f16(<2 x half>) #0
+declare <2 x i16> @llvm.amdgcn.readfirstlane2.v2i16(<2 x i16>) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readfirstlane2.p3i8(i8 addrspace(3)*) #0
+declare i16 @llvm.amdgcn.readfirstlane2.i16(i16) #0
+declare half @llvm.amdgcn.readfirstlane2.f16(half) #0
+declare <3 x i16> @llvm.amdgcn.readfirstlane2.v3i16(<3 x i16>) #0
+declare <9 x float> @llvm.amdgcn.readfirstlane2.v9f32(<9 x float>) #0
 
-; CHECK-LABEL: {{^}}test_readfirstlane:
+; CHECK-LABEL: {{^}}test_readfirstlane_i32:
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
-define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
+define void @test_readfirstlane_i32(i32 addrspace(1)* %out, i32 %src) #1 {
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %src)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -15,7 +24,7 @@
 ; CHECK-NOT: [[SGPR_VAL]]
 ; CHECK: ; use [[SGPR_VAL]]
 define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 32)
   call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
   ret void
 }
@@ -25,7 +34,7 @@
 ; CHECK-NOT: [[VVAL]]
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readfirstlane_imm_fold(i32 addrspace(1)* %out) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 32)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -36,7 +45,7 @@
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %m0)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -51,7 +60,7 @@
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 {
   %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %sgpr)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -62,10 +71,91 @@
 define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
   %alloca = alloca i32, addrspace(5)
   %int = ptrtoint i32 addrspace(5)* %alloca to i32
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %int)
   call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readfirstlane_f32:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_f32(float addrspace(1)* %out, float %src) #1 {
+  %readfirstlane = call float @llvm.amdgcn.readfirstlane2.f32(float %src)
+  store float %readfirstlane, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v2f16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %src) #1 {
+  %readfirstlane = call <2 x half> @llvm.amdgcn.readfirstlane2.v2f16(<2 x half> %src)
+  store <2 x half> %readfirstlane, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v2i16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %src) #1 {
+  %readfirstlane = call <2 x i16> @llvm.amdgcn.readfirstlane2.v2i16(<2 x i16> %src)
+  store <2 x i16> %readfirstlane, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_p3:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %readfirstlane = call i8 addrspace(3)* @llvm.amdgcn.readfirstlane2.p3i8(i8 addrspace(3)* %src)
+  store i8 addrspace(3)* %readfirstlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_i16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_i16(i16 addrspace(1)* %out, i16 %src) {
+  %readfirstlane = call i16 @llvm.amdgcn.readfirstlane2.i16(i16 %src)
+  store i16 %readfirstlane, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_f16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_f16(half addrspace(1)* %out, half %src) {
+  %readfirstlane = call half @llvm.amdgcn.readfirstlane2.f16(half %src)
+  store half %readfirstlane, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v3i16:
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}},
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}},
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %src) {
+  %readfirstlane = call <3 x i16> @llvm.amdgcn.readfirstlane2.v3i16(<3 x i16> %src)
+  store <3 x i16> %readfirstlane, <3 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v9f32:
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v2
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v3
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v4
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v5
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v6
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v7
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v8
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v9
+; CHECK-DAG: v_readfirstlane_b32 s{{[0-9]+}}, v10
+; CHECK-NOT: v_readfirstlane_b32
+define void @test_readfirstlane_v9f32(<9 x float> addrspace(1)* %out, <9 x float> %src) {
+  %readfirstlane = call <9 x float> @llvm.amdgcn.readfirstlane2.v9f32(<9 x float> %src)
+  store <9 x float> %readfirstlane, <9 x float> addrspace(1)* %out, align 2
+  ret void
+}
+
 attributes #0 = { nounwind readnone convergent }
 attributes #1 = { nounwind }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.writelane.mir
@@ -14,11 +14,11 @@
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
     ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
     ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
-    ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[COPY]](s32), [[COPY1]](s32), [[COPY3]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[COPY]](s32), [[COPY1]](s32), [[COPY3]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = COPY $sgpr2
-    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2
+    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2
 ...
 
 ---
@@ -32,11 +32,11 @@
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
     ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
-    ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
     %2:_(s32) = COPY $vgpr0
-    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2
+    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2
 ...
 
 ---
@@ -51,11 +51,11 @@
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
     ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec
-    ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), [[COPY2]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), [[COPY2]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $sgpr0
     %2:_(s32) = COPY $vgpr1
-    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2
+    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2
 ...
 
 ---
@@ -71,11 +71,11 @@
     ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
     ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec
     ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[COPY2]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[COPY2]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = COPY $vgpr2
-    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2
+    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2
 ...
 
 ---
@@ -90,9 +90,9 @@
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
     ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
     ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $vgpr0
     %2:_(s32) = COPY $vgpr1
-    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane), %0, %1, %2
+    %3:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.writelane2), %0, %1, %2
 ...
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -15,7 +15,7 @@
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4)
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
-  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
@@ -35,10 +35,10 @@
   ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4)
   ; CHECK:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
-  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
-  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32)
   ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
   %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
@@ -61,13 +61,13 @@
   ; CHECK:   [[UV:%[0-9]+]]:sgpr(<3 x s32>), [[UV1:%[0-9]+]]:sgpr(<3 x s32>), [[UV2:%[0-9]+]]:sgpr(<3 x s32>), [[UV3:%[0-9]+]]:sgpr(<3 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<12 x s32>)
   ; CHECK:   [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[UV]](<3 x s32>)
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
-  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
-  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32)
   ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
-  ; CHECK:   [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
+  ; CHECK:   [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY7]](s32)
   ; CHECK:   $sgpr2 = COPY [[INT2]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
   %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
@@ -87,28 +87,28 @@
   ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4)
   ; CHECK:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
-  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
-  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32)
   ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
-  ; CHECK:   [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
+  ; CHECK:   [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY7]](s32)
   ; CHECK:   $sgpr2 = COPY [[INT2]](s32)
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
-  ; CHECK:   [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
+  ; CHECK:   [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY8]](s32)
   ; CHECK:   $sgpr3 = COPY [[INT3]](s32)
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
-  ; CHECK:   [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
+  ; CHECK:   [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY9]](s32)
   ; CHECK:   $sgpr4 = COPY [[INT4]](s32)
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
-  ; CHECK:   [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
+  ; CHECK:   [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY10]](s32)
   ; CHECK:   $sgpr5 = COPY [[INT5]](s32)
   ; CHECK:   [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
-  ; CHECK:   [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
+  ; CHECK:   [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY11]](s32)
   ; CHECK:   $sgpr6 = COPY [[INT6]](s32)
   ; CHECK:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
-  ; CHECK:   [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
+  ; CHECK:   [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY12]](s32)
   ; CHECK:   $sgpr7 = COPY [[INT7]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
   %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
@@ -128,52 +128,52 @@
   ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4)
   ; CHECK:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
-  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY5]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
-  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+  ; CHECK:   [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY6]](s32)
   ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
-  ; CHECK:   [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
+  ; CHECK:   [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY7]](s32)
   ; CHECK:   $sgpr2 = COPY [[INT2]](s32)
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
-  ; CHECK:   [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
+  ; CHECK:   [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY8]](s32)
   ; CHECK:   $sgpr3 = COPY [[INT3]](s32)
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
-  ; CHECK:   [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
+  ; CHECK:   [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY9]](s32)
   ; CHECK:   $sgpr4 = COPY [[INT4]](s32)
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
-  ; CHECK:   [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
+  ; CHECK:   [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY10]](s32)
   ; CHECK:   $sgpr5 = COPY [[INT5]](s32)
   ; CHECK:   [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
-  ; CHECK:   [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
+  ; CHECK:   [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY11]](s32)
   ; CHECK:   $sgpr6 = COPY [[INT6]](s32)
   ; CHECK:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
-  ; CHECK:   [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
+  ; CHECK:   [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY12]](s32)
   ; CHECK:   $sgpr7 = COPY [[INT7]](s32)
   ; CHECK:   [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32)
-  ; CHECK:   [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32)
+  ; CHECK:   [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY13]](s32)
   ; CHECK:   $sgpr8 = COPY [[INT8]](s32)
   ; CHECK:   [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32)
-  ; CHECK:   [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32)
+  ; CHECK:   [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY14]](s32)
   ; CHECK:   $sgpr9 = COPY [[INT9]](s32)
   ; CHECK:   [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32)
-  ; CHECK:   [[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32)
+  ; CHECK:   [[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY15]](s32)
   ; CHECK:   $sgpr10 = COPY [[INT10]](s32)
   ; CHECK:   [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32)
-  ; CHECK:   [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32)
+  ; CHECK:   [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY16]](s32)
   ; CHECK:   $sgpr11 = COPY [[INT11]](s32)
   ; CHECK:   [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32)
-  ; CHECK:   [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32)
+  ; CHECK:   [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY17]](s32)
   ; CHECK:   $sgpr12 = COPY [[INT12]](s32)
   ; CHECK:   [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32)
-  ; CHECK:   [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32)
+  ; CHECK:   [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY18]](s32)
   ; CHECK:   $sgpr13 = COPY [[INT13]](s32)
   ; CHECK:   [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32)
-  ; CHECK:   [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32)
+  ; CHECK:   [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY19]](s32)
   ; CHECK:   $sgpr14 = COPY [[INT14]](s32)
   ; CHECK:   [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32)
-  ; CHECK:   [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32)
+  ; CHECK:   [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY20]](s32)
   ; CHECK:   $sgpr15 = COPY [[INT15]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
   %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readlane.mir
@@ -13,10 +13,10 @@
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
     ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[COPY1]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY2]](s32), [[COPY1]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
-    %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1
+    %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1
 ...
 
 ---
@@ -29,10 +29,10 @@
     ; CHECK-LABEL: name: readlane_vs
     ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY]](s32), [[COPY1]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $sgpr0
-    %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1
+    %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1
 ...
 
 ---
@@ -46,10 +46,10 @@
     ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
     ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY]](s32), [[V_READFIRSTLANE_B32_]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
-    %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1
+    %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1
 ...
 
 ---
@@ -64,8 +64,8 @@
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
     ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
     ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]](s32)
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $vgpr0
-    %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %0, %1
+    %2:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane2), %0, %1
 ...
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir
@@ -12,9 +12,9 @@
     ; CHECK-LABEL: name: readfirstlane_s
     ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY1]](s32)
     %0:_(s32) = COPY $sgpr0
-    %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0
+    %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0
 ...
 
 ---
@@ -26,7 +26,7 @@
     liveins: $vgpr0
     ; CHECK-LABEL: name: readfirstlane_v
     ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
-    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
+    ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32)
     %0:_(s32) = COPY $vgpr0
-    %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0
+    %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0
 ...
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll
@@ -76,9 +76,9 @@
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
   ; CHECK:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
-  ; CHECK:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
+  ; CHECK:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY1]](s32)
   ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
 main_body:
@@ -91,7 +91,7 @@
   ; CHECK-LABEL: name: non_void_ret
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[C]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0
   ret i32 0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
@@ -47,7 +47,7 @@
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK:   liveins: $vgpr0
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0
   ret i32 %vgpr
@@ -61,9 +61,9 @@
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
-  ; CHECK:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; CHECK:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV1]](s32)
   ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
   ret i64 %vgpr
@@ -77,9 +77,9 @@
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
-  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
-  ; CHECK:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
+  ; CHECK:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[UV1]](s32)
   ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
   ret <2 x i32> %vgpr
@@ -92,9 +92,9 @@
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
+  ; CHECK:   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32)
   ; CHECK:   $sgpr0 = COPY [[INT]](s32)
-  ; CHECK:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
+  ; CHECK:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY1]](s32)
   ; CHECK:   $sgpr1 = COPY [[INT1]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
   %insertvalue0 = insertvalue { i32, i32 } undef, i32 %vgpr0, 0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
@@ -1,11 +1,11 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel.*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s
 # RUN: FileCheck -check-prefix=ERR %s < %t
 
-# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s)
+# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0:sgpr(s32) (in function: readfirstlane_s32_s)
 
 ---
-name: readfirstlane_v
+name: readfirstlane_s32_v
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
@@ -13,18 +13,18 @@
 body: |
   bb.0:
     liveins: $vgpr0
-    ; GCN-LABEL: name: readfirstlane_v
+    ; GCN-LABEL: name: readfirstlane_s32_v
     ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_READFIRSTLANE_B32_]]
     %0:vgpr(s32) = COPY $vgpr0
-    %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0
+    %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0
     S_ENDPGM 0, implicit %1
 ...
 
 ---
-name: readfirstlane_v_imm
+name: readfirstlane_v_s32_imm
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
@@ -32,19 +32,19 @@
 body: |
   bb.0:
 
-    ; GCN-LABEL: name: readfirstlane_v_imm
+    ; GCN-LABEL: name: readfirstlane_v_s32_imm
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 123, implicit $exec
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_]]
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY]]
     ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_]]
     %0:vgpr(s32) = G_CONSTANT i32 123
-    %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0
+    %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0
     S_ENDPGM 0, implicit %1
 ...
 
 # Make sure this fails to select
 ---
-name: readfirstlane_s
+name: readfirstlane_s32_s
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
@@ -52,12 +52,12 @@
 body: |
   bb.0:
     liveins: $sgpr0
-    ; GCN-LABEL: name: readfirstlane_s
+    ; GCN-LABEL: name: readfirstlane_s32_s
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
-    ; GCN: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
+    ; GCN: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), [[COPY]](s32)
     ; GCN: S_ENDPGM 0, implicit [[INT]](s32)
     %0:sgpr(s32) = COPY $sgpr0
-    %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0
+    %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane2), %0
     S_ENDPGM 0, implicit %1
 ...
Index: llvm/lib/Target/AMDGPU/VOP2Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -525,11 +525,11 @@
 // These are special and do not read the exec mask.
 let isConvergent = 1, Uses = []<Register> in {
 def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
-  [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
+  [(set i32:$vdst, (int_amdgcn_readlane2 i32:$src0, i32:$src1))]>;
 
 let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
 def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
-  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
+  [(set i32:$vdst, (int_amdgcn_writelane2 i32:$src0, i32:$src1, i32:$vdst_in))]>;
 } // End $vdst = $vdst_in, DisableEncoding $vdst_in
 } // End isConvergent = 1
 
Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -163,7 +163,7 @@
   InstSI <(outs SReg_32:$vdst),
     (ins VRegOrLds_32:$src0),
     "v_readfirstlane_b32 $vdst, $src0",
-    [(set i32:$vdst, (int_amdgcn_readfirstlane (i32 VRegOrLds_32:$src0)))]>,
+    [(set i32:$vdst, (int_amdgcn_readfirstlane2 (i32 VRegOrLds_32:$src0)))]>,
   Enc32 {
 
   let isCodeGenOnly = 0;
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2203,7 +2203,7 @@
 // FIXME: Should also do this for readlane, but tablegen crashes on
 // the ignored src1.
 def : GCNPat<
-  (int_amdgcn_readfirstlane (i32 imm:$src)),
+  (i32 (int_amdgcn_readfirstlane2 (i32 imm:$src))),
   (S_MOV_B32 SReg_32:$src)
 >;
 
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -802,6 +802,8 @@
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v3i16, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v3f16, Custom);
 
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -843,6 +843,8 @@
     switch (Intrinsic->getIntrinsicID()) {
     default:
       return false;
+    case Intrinsic::amdgcn_readfirstlane2:
+    case Intrinsic::amdgcn_readlane2:
     case Intrinsic::amdgcn_readfirstlane:
     case Intrinsic::amdgcn_readlane:
     case Intrinsic::amdgcn_icmp:
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -715,6 +715,8 @@
 
   addPass(createAtomicExpandPass());
 
+  if (EnableAtomicOptimizations)
+    addPass(createAMDGPUAtomicOptimizerPass());
 
   addPass(createAMDGPULowerIntrinsicsPass());
 
@@ -871,10 +873,6 @@
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
-  if (EnableAtomicOptimizations) {
-    addPass(createAMDGPUAtomicOptimizerPass());
-  }
-
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
 
Index: llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -251,6 +251,7 @@
 def : SourceOfDivergence<int_amdgcn_mov_dpp>;
 def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
 def : SourceOfDivergence<int_amdgcn_update_dpp>;
+def : SourceOfDivergence<int_amdgcn_writelane2>;
 def : SourceOfDivergence<int_amdgcn_writelane>;
 
 def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -334,7 +334,7 @@
 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
   switch (MI.getIntrinsicID()) {
-  case Intrinsic::amdgcn_readlane: {
+  case Intrinsic::amdgcn_readlane2: {
     static const OpRegBankEntry<3> Table[2] = {
       // Perfectly legal.
       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
@@ -346,7 +346,7 @@
     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
   }
-  case Intrinsic::amdgcn_writelane: {
+  case Intrinsic::amdgcn_writelane2: {
     static const OpRegBankEntry<4> Table[4] = {
       // Perfectly legal.
       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
@@ -2966,7 +2966,7 @@
   }
   case AMDGPU::G_INTRINSIC: {
     switch (MI.getIntrinsicID()) {
-    case Intrinsic::amdgcn_readlane: {
+    case Intrinsic::amdgcn_readlane2: {
       substituteSimpleCopyRegs(OpdMapper, 2);
 
       assert(OpdMapper.getVRegs(0).empty());
@@ -2977,7 +2977,7 @@
       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
       return;
     }
-    case Intrinsic::amdgcn_writelane: {
+    case Intrinsic::amdgcn_writelane2: {
       assert(OpdMapper.getVRegs(0).empty());
       assert(OpdMapper.getVRegs(2).empty());
       assert(OpdMapper.getVRegs(3).empty());
@@ -4115,7 +4115,7 @@
       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
       break;
     }
-    case Intrinsic::amdgcn_readlane: {
+    case Intrinsic::amdgcn_readlane2: {
       // This must be an SGPR, but accept a VGPR.
       Register IdxReg = MI.getOperand(3).getReg();
       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
@@ -4123,14 +4123,14 @@
       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
       LLVM_FALLTHROUGH;
     }
-    case Intrinsic::amdgcn_readfirstlane: {
+    case Intrinsic::amdgcn_readfirstlane2: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
       break;
     }
-    case Intrinsic::amdgcn_writelane: {
+    case Intrinsic::amdgcn_writelane2: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       Register SrcReg = MI.getOperand(2).getReg();
       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -920,7 +920,7 @@
     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
   case Intrinsic::amdgcn_wwm:
     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
-  case Intrinsic::amdgcn_writelane:
+  case Intrinsic::amdgcn_writelane2:
     return selectWritelane(I);
   case Intrinsic::amdgcn_div_scale:
     return selectDivScale(I);
Index: llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -630,7 +630,26 @@
     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
   }
   case Intrinsic::amdgcn_readfirstlane:
-  case Intrinsic::amdgcn_readlane: {
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_writelane: {
+    // Canonicalize to overloaded intrinsics.
+    Intrinsic::ID NewIID;
+    if (IID == Intrinsic::amdgcn_readfirstlane)
+      NewIID = Intrinsic::amdgcn_readfirstlane2;
+    else if (IID == Intrinsic::amdgcn_readlane)
+      NewIID = Intrinsic::amdgcn_readlane2;
+    else
+      NewIID = Intrinsic::amdgcn_writelane2;
+
+    SmallVector<Value *, 3> Args;
+    for (Value *Arg : II.arg_operands())
+      Args.push_back(Arg);
+    CallInst *UpgradedCall = IC.Builder.CreateIntrinsic(
+        NewIID, {II.getArgOperand(0)->getType()}, Args);
+    return IC.replaceInstUsesWith(II, UpgradedCall);
+  }
+  case Intrinsic::amdgcn_readfirstlane2:
+  case Intrinsic::amdgcn_readlane2: {
     // A constant value is trivially uniform.
     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
       return IC.replaceInstUsesWith(II, C);
@@ -646,18 +665,30 @@
     // readfirstlane (readfirstlane x) -> readfirstlane x
     // readlane (readfirstlane x), y -> readfirstlane x
     if (match(Src,
-              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
+              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane2>())) {
       return IC.replaceInstUsesWith(II, Src);
     }
 
-    if (IID == Intrinsic::amdgcn_readfirstlane) {
+    if (IID == Intrinsic::amdgcn_readfirstlane2) {
       // readfirstlane (readlane x, y) -> readlane x, y
-      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
+      if (match(Src,
+                PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane2>())) {
         return IC.replaceInstUsesWith(II, Src);
       }
+
+      // readfirstlane (bitcast x) -> bitcast (readfirstlane x)
+      Value *BitcastInput = nullptr;
+      if (match(Src,
+                PatternMatch::m_BitCast(PatternMatch::m_Value(BitcastInput)))) {
+        CallInst *NewCall =
+            IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane2,
+                                       {BitcastInput->getType()}, BitcastInput);
+        Value *NewCast = IC.Builder.CreateBitCast(NewCall, II.getType());
+        return IC.replaceInstUsesWith(II, NewCast);
+      }
     } else {
       // readlane (readlane x, y), y -> readlane x, y
-      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
+      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane2>(
                          PatternMatch::m_Value(),
                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
         return IC.replaceInstUsesWith(II, Src);
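
For illustration, the new bitcast combine hoists readfirstlane through a
bitcast so the intrinsic reads the value in its original type. A minimal
sketch of the rewrite in LLVM IR, assuming the overloaded name mangling used
elsewhere in this patch (%x, %b, %f are placeholder names):

    ; before
    %b = bitcast float %x to i32
    %r = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %b)
    ; after
    %f = call float @llvm.amdgcn.readfirstlane2.f32(float %x)
    %r = bitcast float %f to i32
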
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -795,9 +795,12 @@
       switch (IntrID) {
         default:
         return false;
+        case Intrinsic::amdgcn_readfirstlane2:
+        case Intrinsic::amdgcn_readlane2:
+          return true;
         case Intrinsic::amdgcn_readfirstlane:
         case Intrinsic::amdgcn_readlane:
-          return true;
+          llvm_unreachable("should have been updated during CodeGenPrepare");
       }
     }
     break;
Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -224,6 +224,10 @@
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+  bool visitLaneIntrinsicInst(IntrinsicInst &I, Intrinsic::ID CanonicalIID);
+  Value *buildLegalLaneIntrinsic(IRBuilder<> &B, Intrinsic::ID IID,
+                                 Value *Data0, Value *Lane = nullptr,
+                                 Value *Data1 = nullptr);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -1344,6 +1348,16 @@
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
     return visitBitreverseIntrinsicInst(I);
+  case Intrinsic::amdgcn_readfirstlane:
+    return visitLaneIntrinsicInst(I, Intrinsic::amdgcn_readfirstlane2);
+  case Intrinsic::amdgcn_readlane:
+    return visitLaneIntrinsicInst(I, Intrinsic::amdgcn_readlane2);
+  case Intrinsic::amdgcn_writelane:
+    return visitLaneIntrinsicInst(I, Intrinsic::amdgcn_writelane2);
+  case Intrinsic::amdgcn_readfirstlane2:
+  case Intrinsic::amdgcn_readlane2:
+  case Intrinsic::amdgcn_writelane2:
+    return visitLaneIntrinsicInst(I, I.getIntrinsicID());
   default:
     return false;
   }
@@ -1359,6 +1373,140 @@
   return Changed;
 }
 
+Value *AMDGPUCodeGenPrepare::buildLegalLaneIntrinsic(IRBuilder<> &B,
+                                                     Intrinsic::ID IID,
+                                                     Value *Data0, Value *Lane,
+                                                     Value *Data1) {
+  Type *Ty = Data0->getType();
+
+  if (Ty == B.getInt32Ty()) {
+    Value *Args[3] = {Data0, Lane, Data1};
+    unsigned NumArgs = Data1 != nullptr ? 3 : Lane != nullptr ? 2 : 1;
+    return B.CreateIntrinsic(IID, {B.getInt32Ty()}, {Args, NumArgs});
+  }
+
+  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) {
+    Type *EltType = VecTy->getElementType();
+    bool is16Bit =
+        (EltType->isIntegerTy() && EltType->getIntegerBitWidth() == 16) ||
+        (EltType->isHalfTy());
+    int EC = VecTy->getElementCount().Min;
+
+    Value *Result = UndefValue::get(Ty);
+    for (int i = 0; i < EC; i += 1 + is16Bit) {
+      Value *EltData0;
+      Value *EltData1 = nullptr;
+
+      if (is16Bit) {
+        int Idxs[2] = {i, i + 1};
+        EltData0 = B.CreateShuffleVector(Data0, UndefValue::get(Ty), Idxs);
+        EltData0 = B.CreateBitCast(EltData0, B.getInt32Ty());
+      } else {
+        EltData0 = B.CreateExtractElement(Data0, i);
+      }
+
+      if (Data1) {
+        if (is16Bit) {
+          int Idxs[2] = {i, i + 1};
+          EltData1 = B.CreateShuffleVector(Data1, UndefValue::get(Ty), Idxs);
+          EltData1 = B.CreateBitCast(EltData1, B.getInt32Ty());
+        } else {
+          EltData1 = B.CreateExtractElement(Data1, i);
+        }
+      }
+
+      Value *EltResult =
+          buildLegalLaneIntrinsic(B, IID, EltData0, Lane, EltData1);
+
+      if (is16Bit) {
+        EltResult =
+            B.CreateBitCast(EltResult, FixedVectorType::get(EltType, 2));
+        for (int j = 0; j < 2; ++j) {
+          if (i + j >= EC)
+            break;
+          Result = B.CreateInsertElement(
+              Result, B.CreateExtractElement(EltResult, j), i + j);
+        }
+      } else {
+        Result = B.CreateInsertElement(Result, EltResult, i);
+      }
+    }
+
+    return Result;
+  }
+
+  unsigned BitWidth = DL->getTypeSizeInBits(Ty);
+  Type *IntTy = Ty;
+
+  if (!Ty->isIntegerTy()) {
+    IntTy = IntegerType::get(Mod->getContext(), BitWidth);
+    Data0 = B.CreateBitOrPointerCast(Data0, IntTy);
+    if (Data1)
+      Data1 = B.CreateBitOrPointerCast(Data1, IntTy);
+  }
+
+  if ((BitWidth % 32) != 0) {
+    Type *ExtendedTy =
+        IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31);
+    Data0 = B.CreateZExt(Data0, ExtendedTy);
+    if (Data1)
+      Data1 = B.CreateZExt(Data1, ExtendedTy);
+  }
+
+  if (BitWidth > 32) {
+    Type *VecTy = FixedVectorType::get(B.getInt32Ty(), (BitWidth + 31) / 32);
+    Data0 = B.CreateBitCast(Data0, VecTy);
+    if (Data1)
+      Data1 = B.CreateBitCast(Data1, VecTy);
+  }
+
+  Value *Result = buildLegalLaneIntrinsic(B, IID, Data0, Lane, Data1);
+
+  if ((BitWidth % 32) != 0) {
+    if (BitWidth > 32) {
+      Result = B.CreateBitCast(
+          Result, IntegerType::get(Mod->getContext(), (BitWidth + 31) & ~31));
+    }
+
+    Result =
+        B.CreateTrunc(Result, IntegerType::get(Mod->getContext(), BitWidth));
+  }
+
+  return B.CreateBitOrPointerCast(Result, Ty);
+}
+
+/// "Legalize" readfirstlane/readlane/writelane to single-dword intrinsics
+/// on i32.
+///
+/// Done during codegen prepare purely because this turned out to be simpler
+/// than supporting the full generality of types in SelectionDAG.
+bool AMDGPUCodeGenPrepare::visitLaneIntrinsicInst(IntrinsicInst &I,
+                                                  Intrinsic::ID CanonicalIID) {
+  Type *Ty = I.getType();
+  if (I.getIntrinsicID() == CanonicalIID && Ty->isIntegerTy(32))
+    return false; // already legal
+
+  Value *Data0 = I.getArgOperand(0);
+  Value *Lane = nullptr;
+  Value *Data1 = nullptr;
+
+  if (CanonicalIID == Intrinsic::amdgcn_readlane2) {
+    Lane = I.getArgOperand(1);
+  } else if (CanonicalIID == Intrinsic::amdgcn_writelane2) {
+    Lane = I.getArgOperand(1);
+    Data1 = I.getArgOperand(2);
+  }
+
+  IRBuilder<> Builder(&I);
+  Value *Legalized =
+      buildLegalLaneIntrinsic(Builder, CanonicalIID, Data0, Lane, Data1);
+
+  I.replaceAllUsesWith(Legalized);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   DL = &Mod->getDataLayout();
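
To make the legalization concrete: a 64-bit input is bitcast to <2 x i32>,
each element is read through the i32 intrinsic, and the pieces are
reassembled. A rough sketch of the IR this pass produces for readfirstlane
(value names are illustrative, not what the builder actually emits):

    ; before
    %r = call i64 @llvm.amdgcn.readfirstlane2.i64(i64 %x)
    ; after
    %v = bitcast i64 %x to <2 x i32>
    %e0 = extractelement <2 x i32> %v, i64 0
    %r0 = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %e0)
    %i0 = insertelement <2 x i32> undef, i32 %r0, i64 0
    %e1 = extractelement <2 x i32> %v, i64 1
    %r1 = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %e1)
    %i1 = insertelement <2 x i32> %i0, i32 %r1, i64 1
    %r = bitcast <2 x i32> %i1 to i64
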
Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -79,7 +79,7 @@
     const SIRegisterInfo *TRI
       = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
     if (TRI->isSGPRReg(MRI, PhysReg)) {
-      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
+      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane2,
                                               {MRI.getType(ExtReg)}, false)
         .addReg(ExtReg);
       ExtReg = ToSGPR.getReg(0);
Index: llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -285,11 +285,11 @@
   Type *const Ty = V->getType();
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty});
   Function *PermLaneX16 =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane2, {Ty});
 
   for (unsigned Idx = 0; Idx < 4; Idx++) {
     V = buildNonAtomicBinOp(
@@ -344,11 +344,11 @@
   Type *const Ty = V->getType();
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, {Ty});
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane2, {Ty});
   Function *WriteLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane2, {Ty});
 
   if (ST->hasDPPWavefrontShifts()) {
     // GFX9 has DPP wavefront shift operations.
@@ -490,25 +490,8 @@
     // each active lane in the wavefront. This will be our new value which we
     // will provide to the atomic operation.
     Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-    if (TyBitWidth == 64) {
-      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
-      Value *const ExtractHi =
-          B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
-      CallInst *const ReadLaneLo = B.CreateIntrinsic(
-          Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
-      CallInst *const ReadLaneHi = B.CreateIntrinsic(
-          Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
-      Value *const PartialInsert = B.CreateInsertElement(
-          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
-      Value *const Insert =
-          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
-      NewV = B.CreateBitCast(Insert, Ty);
-    } else if (TyBitWidth == 32) {
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                               {NewV, LastLaneIdx});
-    } else {
-      llvm_unreachable("Unhandled atomic bit width");
-    }
+    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane2, {Ty},
+                             {NewV, LastLaneIdx});
 
     // Finally mark the readlanes in the WWM section.
     NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
@@ -587,27 +570,8 @@
-    // We need to broadcast the value who was the lowest active lane (the first
-    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
-    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
+    // We need to broadcast the value from the lowest active lane (the first
+    // lane) to all other lanes in the wavefront. The overloaded
+    // readfirstlane2 intrinsic handles any legal bit width in a single call.
-    Value *BroadcastI = nullptr;
-
-    if (TyBitWidth == 64) {
-      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
-      Value *const ExtractHi =
-          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
-      CallInst *const ReadFirstLaneLo =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-      CallInst *const ReadFirstLaneHi =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
-      Value *const PartialInsert = B.CreateInsertElement(
-          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-      Value *const Insert =
-          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-      BroadcastI = B.CreateBitCast(Insert, Ty);
-    } else if (TyBitWidth == 32) {
-
-      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
-    } else {
-      llvm_unreachable("Unhandled atomic bit width");
-    }
+    Value *BroadcastI =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane2, {Ty}, {PHI});
 
     // Now that we have the result of our single atomic operation, we need to
     // get our individual lane's slice into the result. We use the lane offset
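
With the overloaded intrinsic, the optimizer no longer needs the manual
64-bit split (trunc/lshr into halves, two i32 readlane calls, and a vector
reassembly); one call covers any legal width. A sketch, assuming a wave64
target so the last lane index is 63:

    %b = call i64 @llvm.amdgcn.readlane2.i64(i64 %v, i32 63)
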
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1415,23 +1415,38 @@
   Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
-def int_amdgcn_readfirstlane :
-  GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+def int_amdgcn_readfirstlane2 :
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
 // The lane argument must be uniform across the currently active threads of the
 // current wave. Otherwise, the result is undefined.
-def int_amdgcn_readlane :
-  GCCBuiltin<"__builtin_amdgcn_readlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+def int_amdgcn_readlane2 :
+  Intrinsic<[llvm_any_ty],
+            [LLVMMatchType<0>,  // data input
+             llvm_i32_ty],      // uniform lane select
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
 // The value to write and lane select arguments must be uniform across the
 // currently active threads of the current wave. Otherwise, the result is
 // undefined.
+def int_amdgcn_writelane2 :
+  Intrinsic<[llvm_any_ty], [
+    LLVMMatchType<0>,  // uniform value to write: returned by the selected lane
+    llvm_i32_ty,       // uniform lane select
+    LLVMMatchType<0>   // returned by all lanes other than the selected one
+  ],
+  [IntrNoMem, IntrConvergent, IntrWillReturn]
+>;
+
+// Non-overloaded versions of readfirstlane2 / readlane2 / writelane2.
+def int_amdgcn_readfirstlane :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn]>;
+def int_amdgcn_readlane :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 def int_amdgcn_writelane :
-  GCCBuiltin<"__builtin_amdgcn_writelane">,
   Intrinsic<[llvm_i32_ty], [
     llvm_i32_ty,    // uniform value to write: returned by the selected lane
     llvm_i32_ty,    // uniform lane select
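
Because the new definitions use llvm_any_ty with LLVMMatchType, a single
intrinsic covers integers, floating point, pointers, and vectors of those;
the concrete type is encoded in the mangled name. Hypothetical calls, with
types chosen purely for illustration:

    %a = call float @llvm.amdgcn.readfirstlane2.f32(float %x)
    %b = call <2 x i16> @llvm.amdgcn.readlane2.v2i16(<2 x i16> %y, i32 %lane)
    %c = call i8* @llvm.amdgcn.writelane2.p0i8(i8* %val, i32 %lane, i8* %old)
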
Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -291,15 +291,15 @@
   *out = __builtin_amdgcn_ds_bpermute(a, b);
 }
 
-// CHECK-LABEL: @test_readfirstlane
-// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK-LABEL: @test_readfirstlane(
+// CHECK: call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %a)
 void test_readfirstlane(global int* out, int a)
 {
   *out = __builtin_amdgcn_readfirstlane(a);
 }
 
 // CHECK-LABEL: @test_readlane
-// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+// CHECK: call i32 @llvm.amdgcn.readlane2.i32(i32 %a, i32 %b)
 void test_readlane(global int* out, int a, int b)
 {
   *out = __builtin_amdgcn_readlane(a, b);
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -14873,6 +14873,10 @@
     }
     LLVM_FALLTHROUGH;
   }
+  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane2);
+  case AMDGPU::BI__builtin_amdgcn_readlane:
+    return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane2);
   default:
     return nullptr;
   }