llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-driver Author: Stanislav Mekhanoshin (rampitec) <details> <summary>Changes</summary> - gfx1250 only supports cu mode --- Patch is 156.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153680.diff 13 Files Affected: - (modified) clang/test/CodeGenHIP/hip-cumode.hip (+8-2) - (modified) clang/test/Driver/hip-macros.hip (+10-4) - (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+4-1) - (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+3-2) - (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+5-1) - (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp (+2-1) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+18-3) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+51-50) - (modified) llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll (+141-223) - (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll (+1211) - (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+144-144) - (modified) llvm/test/MC/AMDGPU/hsa-diag-v4.s (+25-8) ``````````diff diff --git a/clang/test/CodeGenHIP/hip-cumode.hip b/clang/test/CodeGenHIP/hip-cumode.hip index 1aa1ca7a1a7ee..61fd53c644e8c 100644 --- a/clang/test/CodeGenHIP/hip-cumode.hip +++ b/clang/test/CodeGenHIP/hip-cumode.hip @@ -5,14 +5,20 @@ // RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \ // RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s // RUN: %clang -S -o - --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \ -// RUN: %s 2>&1 | FileCheck --check-prefixes=NOWGP,WARN-CUMODE %s +// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=NOWGP,WARN-CUMODE %s // RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s // RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \ // RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s // RUN: %clang -S -o - --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \ // RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s -// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored] +// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s +// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \ +// RUN: %s 2>&1 | FileCheck --check-prefix=NOWGP %s +// RUN: %clang -S -o - --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \ +// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=NOWGP,WARN-CUMODE %s +// WARN-CUMODE: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored] // NOWGP-NOT: .amdhsa_workgroup_processor_mode // CUMODE-ON: .amdhsa_workgroup_processor_mode 0 // CUMODE-OFF: .amdhsa_workgroup_processor_mode 1 diff --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip index bd93f9985a774..516e01a6c4743 100644 --- a/clang/test/Driver/hip-macros.hip +++ b/clang/test/Driver/hip-macros.hip @@ -27,21 +27,27 @@ // RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \ // RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s // RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \ -// RUN: %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s +// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s // RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s // RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mcumode \ // RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s // RUN: %clang -E -dM --offload-arch=gfx1030 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \ // RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-OFF %s +// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s +// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mcumode \ +// RUN: %s 2>&1 | FileCheck --check-prefix=CUMODE-ON %s +// RUN: %clang -E -dM --offload-arch=gfx1250 --cuda-device-only -nogpuinc -nogpulib -mno-cumode \ +// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx1250 --check-prefixes=CUMODE-ON,WARN-CUMODE %s // Check no duplicate warnings. // RUN: %clang -E -dM --offload-arch=gfx906 --cuda-device-only -nogpuinc -nogpulib -mcumode \ // RUN: -mno-cumode -mno-cumode \ -// RUN: %s 2>&1 | FileCheck --check-prefixes=CUMODE-ON,WARN-CUMODE %s +// RUN: %s 2>&1 | FileCheck -DOFFLOAD_ARCH=gfx906 --check-prefixes=CUMODE-ON,WARN-CUMODE %s -// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored] -// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor 'gfx906' [-Woption-ignored] +// WARN-CUMODE-DAG: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored] +// WARN-CUMODE-NOT: warning: ignoring '-mno-cumode' option as it is not currently supported for processor '[[OFFLOAD_ARCH]]' [-Woption-ignored] // CUMODE-ON-DAG: #define __AMDGCN_CUMODE__ 1 // CUMODE-OFF-DAG: #define __AMDGCN_CUMODE__ 0 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index c7d2d268a2707..188c126cb9fbe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1144,8 +1144,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, CreateExpr(STM.getWavefrontSize()), Ctx), CreateExpr(1ULL << ScratchAlignShift)); - if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { + if (STM.supportsWGP()) { ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; + } + + if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { ProgInfo.MemOrdered = 1; ProgInfo.FwdProgress = 1; } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 0184075c2c909..951473264d089 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -6270,8 +6270,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ExprVal, ValRange); } else if (ID == ".amdhsa_workgroup_processor_mode") { - if (IVersion.Major < 10) - return Error(IDRange.Start, "directive requires gfx10+", IDRange); + if (!supportsWGP(getSTI())) + return Error(IDRange.Start, + "directive unsupported on " + getSTI().getCPU(), IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ExprVal, ValRange); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index f47ddf5d93ec3..7ca7e8448c63d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -390,7 +390,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// the original value. bool zeroesHigh16BitsOfDest(unsigned Opcode) const; - bool supportsWGP() const { return getGeneration() >= GFX10; } + bool supportsWGP() const { + if (GFX1250Insts) + return false; + return getGeneration() >= GFX10; + } bool hasIntClamp() const { return HasIntClamp; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 68302f0dd0d64..1f35e92151bfc 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -563,11 +563,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PrintField(KD.compute_pgm_rsrc3, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, ".amdhsa_tg_split"); - if (IVersion.Major >= 10) { + if (AMDGPU::supportsWGP(STI)) PrintField(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE_SHIFT, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, ".amdhsa_workgroup_processor_mode"); + if (IVersion.Major >= 10) { PrintField(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT, amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index ec9f1abdd8467..c41d62748c4be 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1167,12 +1167,21 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) { unsigned getEUsPerCU(const MCSubtargetInfo *STI) { // "Per CU" really means "per whatever functional block the waves of a - // workgroup must share". For gfx10 in CU mode this is the CU, which contains + // workgroup must share". + + // GFX12.5 only supports CU mode, which contains four SIMDs. + if (isGFX1250(*STI)) { + assert(STI->getFeatureBits().test(FeatureCuMode)); + return 4; + } + + // For gfx10 in CU mode the functional block is the CU, which contains // two SIMDs. if (isGFX10Plus(*STI) && STI->getFeatureBits().test(FeatureCuMode)) return 2; - // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains - // two CUs, so a total of four SIMDs. + + // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP + // contains two CUs, so a total of four SIMDs. return 4; } @@ -2480,6 +2489,12 @@ bool isGFX1250(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts]; } +bool supportsWGP(const MCSubtargetInfo &STI) { + if (isGFX1250(STI)) + return false; + return isGFX10Plus(STI); +} + bool isNotGFX11Plus(const MCSubtargetInfo &STI) { return !isGFX11Plus(STI); } bool isNotGFX10Plus(const MCSubtargetInfo &STI) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 704bf106ace76..befab68bb5698 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1549,6 +1549,7 @@ bool isGFX11Plus(const MCSubtargetInfo &STI); bool isGFX12(const MCSubtargetInfo &STI); bool isGFX12Plus(const MCSubtargetInfo &STI); bool isGFX1250(const MCSubtargetInfo &STI); +bool supportsWGP(const MCSubtargetInfo &STI); bool isNotGFX12Plus(const MCSubtargetInfo &STI); bool isNotGFX11Plus(const MCSubtargetInfo &STI); bool isGCN3Encoding(const MCSubtargetInfo &STI); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 3daae98961bff..01854c8560ce2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2854,89 +2854,90 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0 +; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 ; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10 ; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] ; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1] ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1] ; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17] -; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17] -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] ; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 ; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9 ; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18 ; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17] -; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21] ; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19] -; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24 +; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24 ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19] -; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19] +; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15 ; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13] ; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 -; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14 +; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14 ; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2 -; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, v15 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo -; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0 -; GFX1250-NEXT: v_mov_b32_e32 v0, v16 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo +; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1 +; GFX1250-NEXT: v_mov_b32_e32 v1, v14 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i256 %num, %den ret i256 %result diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index f0db321d3931a..e532deaca98a8 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s -; Test S_WAIT_XCNT in... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/153680 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits