Author: Drew Kersnar
Date: 2025-08-19T11:13:57-05:00
New Revision: 069ad2353c4bd32b883e2d5ce4a7f191bf6098e1

URL: 
https://github.com/llvm/llvm-project/commit/069ad2353c4bd32b883e2d5ce4a7f191bf6098e1
DIFF: 
https://github.com/llvm/llvm-project/commit/069ad2353c4bd32b883e2d5ce4a7f191bf6098e1.diff

LOG: [NVPTXLowerArgs] Add align attribute to return value of addrspace.wrap intrinsic (#153889)

If alignment inference happens after NVPTXLowerArgs, these addrspace-wrap
intrinsics can prevent computeKnownBits from deriving the alignment of
loads/stores from kernel parameters. To solve this, we insert an alignment
annotation on the return value of the generated intrinsic so that
computeKnownBits does not need to look through it to find the alignment.
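
For illustration, a minimal IR sketch of the effect (modeled on the new
lower-args-alignment.ll test below; the %s parameter name and the byval align 8
declaration are illustrative assumptions, not taken verbatim from the patch):

  ; Without the attribute, infer-alignment keeps the load at its original align 4:
  %s.param = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr %s)
  %v = load i32, ptr addrspace(101) %s.param, align 4

  ; With the attribute, the byval alignment is carried on the return value, so the
  ; load can be raised to align 8 without looking through the intrinsic:
  %s.param = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr %s)
  %v = load i32, ptr addrspace(101) %s.param, align 8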

Added: 
    llvm/test/CodeGen/NVPTX/lower-args-alignment.ll

Modified: 
    llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
    llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
    llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
    llvm/test/CodeGen/NVPTX/lower-args.ll
    llvm/test/CodeGen/NVPTX/lower-byval-args.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2445005bf98ce..520ce4deb9a57 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1027,9 +1027,16 @@ static inline bool isAddLike(const SDValue V) {
          (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
 }
 
+static SDValue stripAssertAlign(SDValue N) {
+  if (N.getOpcode() == ISD::AssertAlign)
+    N = N.getOperand(0);
+  return N;
+}
+
 // selectBaseADDR - Match a dag node which will serve as the base address for an
 // ADDR operand pair.
 static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
+  N = stripAssertAlign(N);
   if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
     return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
                                        GA->getValueType(0), GA->getOffset(),
@@ -1044,6 +1051,7 @@ static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
 }
 
 static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
+  Addr = stripAssertAlign(Addr);
   APInt AccumulatedOffset(64u, 0);
   while (isAddLike(Addr)) {
     const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
@@ -1055,7 +1063,7 @@ static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
       break;
 
     AccumulatedOffset += CI;
-    Addr = Addr->getOperand(0);
+    Addr = stripAssertAlign(Addr->getOperand(0));
   }
   return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL,
                                       MVT::i32);

diff  --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index f4362fe8d9056..e2bbe57c0085c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -412,6 +412,22 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
   }
 }
 
+// Create a call to the nvvm_internal_addrspace_wrap intrinsic and set the
+// alignment of the return value based on the alignment of the argument.
+static CallInst *createNVVMInternalAddrspaceWrap(IRBuilder<> &IRB,
+                                                 Argument &Arg) {
+  CallInst *ArgInParam =
+      IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
+                          {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
+                          &Arg, {}, Arg.getName() + ".param");
+
+  if (MaybeAlign ParamAlign = Arg.getParamAlign())
+    ArgInParam->addRetAttr(
+        Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign));
+
+  return ArgInParam;
+}
+
 namespace {
 struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
   using Base = PtrUseVisitor<ArgUseChecker>;
@@ -515,10 +531,7 @@ void copyByValParam(Function &F, Argument &Arg) {
       Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType)));
   Arg.replaceAllUsesWith(AllocA);
 
-  Value *ArgInParam =
-      IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
-                          {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
-                          &Arg, {}, Arg.getName());
+  CallInst *ArgInParam = createNVVMInternalAddrspaceWrap(IRB, Arg);
 
   // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
   // addrspacecast preserves alignment.  Since params are constant, this load
@@ -549,9 +562,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
     SmallVector<Use *, 16> UsesToUpdate(llvm::make_pointer_range(Arg->uses()));
 
     IRBuilder<> IRB(&*FirstInst);
-    Value *ArgInParamAS = IRB.CreateIntrinsic(
-        Intrinsic::nvvm_internal_addrspace_wrap,
-        {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, {Arg});
+    CallInst *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, *Arg);
 
     for (Use *U : UsesToUpdate)
       convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
@@ -581,10 +592,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
     // argument already in the param address space, we need to use the noop
     // intrinsic, this had the added benefit of preventing other optimizations
     // from folding away this pair of addrspacecasts.
-    auto *ParamSpaceArg =
-        IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
-                            {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()},
-                            Arg, {}, Arg->getName() + ".param");
+    auto *ParamSpaceArg = createNVVMInternalAddrspaceWrap(IRB, *Arg);
 
     // Cast param address to generic address space.
     Value *GenericArg = IRB.CreateAddrSpaceCast(

diff  --git a/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll b/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll
new file mode 100644
index 0000000000000..2051f6305cc03
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-args-alignment.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=nvptx-lower-args,infer-alignment -S | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+; ------------------------------------------------------------------------------
+; Test that alignment can be inferred through llvm.nvvm.internal.addrspace.wrap.p101.p0 intrinsics
+; thanks to the alignment attribute on the intrinsic
+; ------------------------------------------------------------------------------
+
+%struct.S1 = type { i32, i32, i32, i32 }
+define ptx_kernel i32 @test_align8(ptr noundef readonly byval(%struct.S1) align 8 captures(none) %params) {
+; CHECK-LABEL: define ptx_kernel i32 @test_align8(
+; CHECK-SAME: ptr noundef readonly byval([[STRUCT_S1:%.*]]) align 8 captures(none) [[PARAMS:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[PARAMS]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(101) [[TMP0]], align 8
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %load = load i32, ptr %params, align 4
+  ret i32 %load
+}
+
+define ptx_kernel i32 @test_align1(ptr noundef readonly byval(%struct.S1) align 1 captures(none) %params) {
+; CHECK-LABEL: define ptx_kernel i32 @test_align1(
+; CHECK-SAME: ptr noundef readonly byval([[STRUCT_S1:%.*]]) align 4 captures(none) [[PARAMS:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call align 1 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[PARAMS]])
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(101) [[TMP0]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %load = load i32, ptr %params, align 4
+  ret i32 %load
+}

diff  --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 045704bdcd3fc..f5df0fcde1883 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -72,7 +72,7 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_int(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
-; OPT-NEXT:    [[INPUT11:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT:    [[INPUT11:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; OPT-NEXT:    [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
 ; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
 ; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT]], align 4
@@ -101,7 +101,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_struct(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT:    [[INPUT1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT:    [[INPUT1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT:    [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
 ; OPT-NEXT:    [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
 ; OPT-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
@@ -137,7 +137,7 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
 ; OPT-NEXT:    ret void
@@ -180,9 +180,9 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
-; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
+; OPT-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
 ; OPT-NEXT:    [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
-; OPT-NEXT:    [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT:    [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
 ; OPT-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 ; OPT-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
@@ -208,7 +208,7 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
-; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; OPT-NEXT:    store ptr [[INPUT1]], ptr [[ADDR]], align 8
 ; OPT-NEXT:    ret void
@@ -235,7 +235,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
 ; PTX-NOT      .local
 ; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; OPT-NEXT:    [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
 ; OPT-NEXT:    [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
@@ -357,7 +357,7 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr
 ; PTX-NEXT:    ret;
 ; OPT-LABEL: define ptx_kernel void @grid_const_phi(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
 ; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@@ -416,7 +416,7 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1,
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; OPT-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
-; OPT-NEXT:    [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT:    [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
 ; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
 ; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@@ -471,7 +471,7 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; OPT-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
-; OPT-NEXT:    [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; OPT-NEXT:    [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
 ; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
 ; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@@ -520,7 +520,7 @@ declare void @device_func(ptr byval(i32) align 4)
 define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
 ; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg(
 ; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
-; OPT-NEXT:    [[INPUT_PARAM:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; OPT-NEXT:    [[INPUT_PARAM:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr
 ; OPT-NEXT:    call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]])
 ; OPT-NEXT:    ret void

diff  --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 7c029ab516d6e..b4a51035c6610 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -200,7 +200,7 @@ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
 define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
 ; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr(
 ; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
-; IRC-NEXT:    [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; IRC-NEXT:    [[S3:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; IRC-NEXT:    [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8
 ; IRC-NEXT:    [[P:%.*]] = inttoptr i64 [[I]] to ptr
 ; IRC-NEXT:    [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
@@ -210,7 +210,7 @@ define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%st
 ;
 ; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr(
 ; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
-; IRO-NEXT:    [[S1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; IRO-NEXT:    [[S1:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; IRO-NEXT:    [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8
 ; IRO-NEXT:    [[P:%.*]] = inttoptr i64 [[I]] to ptr
 ; IRO-NEXT:    store i32 [[V]], ptr [[P]], align 4

diff  --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 20a35198c3c16..4d36ff9496ede 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -32,7 +32,7 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out
 ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only(
 ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; LOWER-ARGS-NEXT:  [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT:    [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; LOWER-ARGS-NEXT:    [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; LOWER-ARGS-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
 ; LOWER-ARGS-NEXT:    store i32 [[I]], ptr [[OUT]], align 4
 ; LOWER-ARGS-NEXT:    ret void
@@ -66,7 +66,7 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly
 ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep(
 ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; LOWER-ARGS-NEXT:  [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT:    [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; LOWER-ARGS-NEXT:    [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; LOWER-ARGS-NEXT:    [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
 ; LOWER-ARGS-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
 ; LOWER-ARGS-NEXT:    store i32 [[I]], ptr [[OUT]], align 4
@@ -128,7 +128,7 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out
 ; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; COMMON-NEXT:  [[ENTRY:.*:]]
 ; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT:    [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]]
 ; COMMON-NEXT:    ret void
@@ -167,7 +167,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
 ; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT:  [[ENTRY:.*:]]
 ; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT:    [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
 ; COMMON-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]]
@@ -209,7 +209,7 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
 ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT:  [[ENTRY:.*:]]
 ; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT:    [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT:    store ptr [[S1]], ptr [[OUT]], align 8
 ; COMMON-NEXT:    ret void
@@ -246,7 +246,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
 ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT:  [[ENTRY:.*:]]
 ; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT:    [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
 ; COMMON-NEXT:    store ptr [[B]], ptr [[OUT]], align 8
@@ -286,7 +286,7 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
 ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT:  [[ENTRY:.*:]]
 ; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT:    [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT:    [[I:%.*]] = ptrtoint ptr [[S1]] to i64
 ; COMMON-NEXT:    store i64 [[I]], ptr [[OUT]], align 8
@@ -324,7 +324,7 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo
 ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
 ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; LOWER-ARGS-NEXT:  [[ENTRY:.*:]]
-; LOWER-ARGS-NEXT:    [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; LOWER-ARGS-NEXT:    [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; LOWER-ARGS-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
 ; LOWER-ARGS-NEXT:    ret void
 ;
@@ -445,7 +445,7 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
 ; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT:  [[ENTRY:.*:]]
 ; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT:    [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true)
 ; COMMON-NEXT:    ret void
@@ -525,7 +525,7 @@ define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %
 ; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; COMMON-NEXT:  [[BB:.*:]]
 ; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
+; COMMON-NEXT:    [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
 ; COMMON-NEXT:    [[I:%.*]] = load i32, ptr [[IN]], align 4
 ; COMMON-NEXT:    store i32 [[I]], ptr [[S1]], align 4
@@ -551,7 +551,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
 ; SM_60-NEXT:    [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
 ; SM_60-NEXT:    [[INPUT11:%.*]] = alloca i32, align 4
-; SM_60-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_60-NEXT:    [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
 ; SM_60-NEXT:    [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
 ; SM_60-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -563,7 +563,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; SM_70-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
-; SM_70-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_70-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; SM_70-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; SM_70-NEXT:    [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
 ; SM_70-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -577,7 +577,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
 ; COPY-NEXT:    [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; COPY-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
 ; COPY-NEXT:    [[INPUT11:%.*]] = alloca i32, align 4
-; COPY-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COPY-NEXT:    [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; COPY-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
 ; COPY-NEXT:    [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
 ; COPY-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@@ -637,7 +637,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
 ; COMMON-NEXT:    [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
 ; COMMON-NEXT:    [[INPUT11:%.*]] = alloca i32, align 4
-; COMMON-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COMMON-NEXT:    [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
 ; COMMON-NEXT:    [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
 ; COMMON-NEXT:    store i32 1, ptr [[PTRNEW]], align 4
@@ -682,7 +682,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; SM_60-NEXT:    [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
 ; SM_60-NEXT:    [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; SM_60-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_60-NEXT:    [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
 ; SM_60-NEXT:    br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
 ; SM_60:       [[FIRST]]:
@@ -702,7 +702,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; SM_70-NEXT:  [[BB:.*:]]
 ; SM_70-NEXT:    [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; SM_70-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
-; SM_70-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; SM_70-NEXT:    [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; SM_70-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
 ; SM_70-NEXT:    br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
 ; SM_70:       [[FIRST]]:
@@ -724,7 +724,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
 ; COPY-NEXT:    [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; COPY-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT23]], ptr addrspace(101) align 8 [[INPUT24]], i64 8, i1 false)
 ; COPY-NEXT:    [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; COPY-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COPY-NEXT:    [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; COPY-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
 ; COPY-NEXT:    br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
 ; COPY:       [[FIRST]]:
@@ -808,7 +808,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr
 ; COMMON-NEXT:    [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
 ; COMMON-NEXT:    [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
-; COMMON-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
+; COMMON-NEXT:    [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
 ; COMMON-NEXT:    br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
 ; COMMON:       [[FIRST]]:
@@ -871,7 +871,7 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
 ; COMMON-LABEL: define ptx_kernel void @test_forward_byval_arg(
 ; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
 ; COMMON-NEXT:    [[INPUT1:%.*]] = alloca i32, align 4
-; COMMON-NEXT:    [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
+; COMMON-NEXT:    [[INPUT2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
 ; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
 ; COMMON-NEXT:    call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
 ; COMMON-NEXT:    ret void


        