[Mesa-dev] [PATCH] fbo-depth-array: Check completeness with a color texture
--- tests/all.py| 2 +- tests/fbo/fbo-depth-array.c | 36 ++-- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/tests/all.py b/tests/all.py index 586cead..9aa600f 100644 --- a/tests/all.py +++ b/tests/all.py @@ -2831,7 +2831,7 @@ add_shader_test_dir(ext_texture_array, add_msaa_visual_plain_tests(ext_texture_array, 'copyteximage 1D_ARRAY') add_msaa_visual_plain_tests(ext_texture_array, 'copyteximage 2D_ARRAY') add_plain_test(ext_texture_array, 'fbo-array') -for test in ('depth-clear', 'depth-layered-clear', 'depth-draw', 'fs-writes-depth', +for test in ('depth-clear', 'depth-layered-clear', 'depth-stencil-color-clear', 'depth-draw', 'fs-writes-depth', 'stencil-clear', 'stencil-layered-clear', 'stencil-draw', 'fs-writes-stencil'): add_concurrent_test(ext_texture_array, 'fbo-depth-array ' + test) add_plain_test(ext_texture_array, 'array-texture') diff --git a/tests/fbo/fbo-depth-array.c b/tests/fbo/fbo-depth-array.c index 84370e4..dde807d 100644 --- a/tests/fbo/fbo-depth-array.c +++ b/tests/fbo/fbo-depth-array.c @@ -46,6 +46,7 @@ enum { CLEAR, LAYERED_CLEAR, + LAYERED_DEPTH_STENCIL_COLOR_CLEAR, DRAW, FS_WRITES_VALUE, }; @@ -135,11 +136,13 @@ static GLuint program_stencil_output; static GLuint program_texdepth; static GLuint program_texstencil; +static GLuint color_texture; + static float get_depth_value(unsigned layer) { - if (test == LAYERED_CLEAR) + if (test == LAYERED_CLEAR || LAYERED_DEPTH_STENCIL_COLOR_CLEAR) return 0.4; /* constant */ else return (double)(layer+1) / (layers+1); @@ -181,6 +184,10 @@ parse_args(int argc, char **argv) test = LAYERED_CLEAR; puts(Testing layered glClear); } + else if (!strcmp(argv[i], depth-stencil-color-layered-clear)) { + test = LAYERED_DEPTH_STENCIL_COLOR_CLEAR; + puts(Testing depth stencil color layered glClear); + } else if (!strcmp(argv[i], depth-draw)) { test = DRAW; puts(Testing drawing); @@ -224,6 +231,14 @@ create_array_fbo(void) int layer; glGenTextures(1, tex); + glGenTextures(1, color_texture); + + 
glBindTexture(GL_TEXTURE_2D_ARRAY, color_texture); + glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA, +width, height, layers, 0, +GL_RGBA, GL_UNSIGNED_INT, NULL); + assert(glGetError() == 0); + glBindTexture(GL_TEXTURE_2D_ARRAY, tex); assert(glGetError() == 0); @@ -241,11 +256,19 @@ create_array_fbo(void) /* draw something into each layer of the array texture */ for (layer = 0; layer layers; layer++) { - if (test == LAYERED_CLEAR) { - glFramebufferTexture(GL_FRAMEBUFFER, -test_stencil ? GL_STENCIL_ATTACHMENT : - GL_DEPTH_ATTACHMENT, -tex, 0); + if (test == LAYERED_CLEAR || test == LAYERED_DEPTH_STENCIL_COLOR_CLEAR) { + if (test == LAYERED_DEPTH_STENCIL_COLOR_CLEAR) { + glFramebufferTexture(GL_FRAMEBUFFER, +GL_COLOR_ATTACHMENT0_EXT, +color_texture, 0); + glFramebufferTexture(GL_FRAMEBUFFER, + GL_DEPTH_STENCIL_ATTACHMENT, +tex, 0); + } else + glFramebufferTexture(GL_FRAMEBUFFER, +test_stencil ? GL_STENCIL_ATTACHMENT : + GL_DEPTH_ATTACHMENT, +tex, 0); status = glCheckFramebufferStatus(GL_FRAMEBUFFER); if (status != GL_FRAMEBUFFER_COMPLETE) { @@ -433,6 +456,7 @@ test_once(void) } glDeleteTextures(1, tex); + glDeleteTextures(1, color_texture); assert(glGetError() == 0); return pass; } -- 1.9.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] Copy Layered field in reuse_framebuffer_texture_attachment
Fix #83596 https://bugs.freedesktop.org/show_bug.cgi?id=83596 --- src/mesa/main/fbobject.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c index ae3a418..5eaf1a3 100644 --- a/src/mesa/main/fbobject.c +++ b/src/mesa/main/fbobject.c @@ -2299,6 +2299,7 @@ reuse_framebuffer_texture_attachment(struct gl_framebuffer *fb, dst_att->Complete = src_att->Complete; dst_att->TextureLevel = src_att->TextureLevel; dst_att->Zoffset = src_att->Zoffset; + dst_att->Layered = src_att->Layered; } -- 1.9.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] PATCHES: R600: Implement work-around for CF stack HW bug
Some cosmetic comments below, otherwise the patches are: reviewed-by: Vincent Lejeune vljn at ovi.com -OutStreamer.EmitRawText( - Twine(; Kernel info:\n) + - ; NumSgprs: + Twine(KernelInfo.NumSGPR) + \n + - ; NumVgprs: + Twine(KernelInfo.NumVGPR) + \n); +if (STM.getGeneration() AMDGPUSubtarget::NORTHERN_ISLANDS) { + I think it would look cleaner without empty newline here + OutStreamer.EmitRawText( +Twine(; Kernel info:\n) + +; NumSgprs: + Twine(KernelInfo.NumSGPR) + \n + +; NumVgprs: + Twine(KernelInfo.NumVGPR) + \n); +} else { +void CFStack::pushBranch(unsigned Opcode, bool isWQM) { + CFStack::StackItem Item = CFStack::ENTRY; + switch(Opcode) { + case AMDGPU::CF_PUSH_EG: + case AMDGPU::CF_ALU_PUSH_BEFORE: +if (!isWQM) { + if (!ST.hasCaymanISA() !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) +Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI + // See comment in + // CFStack::getSubEntrySize() + else if (CurrentEntries 0 + ST.getGeneration() AMDGPUSubtarget::EVERGREEN + !ST.hasCaymanISA() + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) +Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; + else +Item = CFStack::SUB_ENTRY; +} else { + Item = CFStack::ENTRY; It's a single line statement, I think it should be without brace. 
+} +break; case AMDGPU::CF_ALU_PUSH_BEFORE: - CurrentStack++; - MaxStack = std::max(MaxStack, CurrentStack); - HasPush = true; - if (ST.hasCaymanISA() CurrentLoopDepth 1) { + if (ST.hasCaymanISA() CFStack.getLoopDepth() 1) { BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII-get(AMDGPU::CF_PUSH_EG)) .addImm(CfCount + 1) .addImm(1); MI-setDesc(TII-get(AMDGPU::CF_ALU)); CfCount++; +CFStack.pushBranch(AMDGPU::CF_PUSH_EG); + } else { +CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); Here too } +bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { + if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE ST.hasCaymanISA() + getLoopDepth() 1) { +return true; And here too + } Thanks for this patch set, stack bugs are really not easy to spot and fix. Vincent Le Mercredi 11 décembre 2013 19h07, Tom Stellard t...@stellard.net a écrit : Hi, The attached patches implement a work-around for the CF stack HW bug that is present on some Evergreen and NI GPUs. Please review. -Tom ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600/llvm: Allow arbitrary amount of temps in tgsi to llvm
--- src/gallium/drivers/radeon/radeon_llvm.h | 6 +++ .../drivers/radeon/radeon_setup_tgsi_llvm.c| 43 -- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index 2cab6b0..00714fb 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -112,6 +112,12 @@ struct radeon_llvm_context { LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS]; unsigned output_reg_count; + /** This pointer is used to contain the temporary values. + * The amount of temporary used in tgsi can't be bound to a max value and + * thus we must allocate this array at runtime. + */ + LLVMValueRef *temps; + unsigned temps_count; LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES]; /*=== Private Members ===*/ diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 3bb01ec..4c30de4 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -142,6 +142,13 @@ emit_array_fetch( return result; } +static bool uses_temp_indirect_addressing( + struct lp_build_tgsi_context *bld_base) +{ + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + return (bld-indirect_files (1 TGSI_FILE_TEMPORARY)); +} + static LLVMValueRef emit_fetch( struct lp_build_tgsi_context *bld_base, @@ -184,7 +191,11 @@ emit_fetch( break; case TGSI_FILE_TEMPORARY: - ptr = lp_get_temp_ptr_soa(bld, reg-Register.Index, swizzle); + if (uses_temp_indirect_addressing(bld_base)) { + ptr = lp_get_temp_ptr_soa(bld, reg-Register.Index, swizzle); + break; + } + ptr = ctx-temps[reg-Register.Index * TGSI_NUM_CHANNELS + swizzle]; result = LLVMBuildLoad(builder, ptr, ); break; @@ -216,6 +227,7 @@ static void emit_declaration( const struct tgsi_full_declaration *decl) { struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + unsigned first, 
last, i, idx; switch(decl-Declaration.File) { case TGSI_FILE_ADDRESS: { @@ -234,7 +246,23 @@ static void emit_declaration( case TGSI_FILE_TEMPORARY: if (decl-Declaration.Array decl-Array.ArrayID = RADEON_LLVM_MAX_ARRAYS) ctx-arrays[decl-Array.ArrayID - 1] = decl-Range; - lp_emit_declaration_soa(bld_base, decl); + if (uses_temp_indirect_addressing(bld_base)) { + lp_emit_declaration_soa(bld_base, decl); + break; + } + first = decl-Range.First; + last = decl-Range.Last; + if (!ctx-temps_count) { + ctx-temps_count = bld_base-info-file_max[TGSI_FILE_TEMPORARY] + 1; + ctx-temps = MALLOC(TGSI_NUM_CHANNELS * ctx-temps_count * sizeof(LLVMValueRef)); + } + for (idx = first; idx = last; idx++) { + for (i = 0; i TGSI_NUM_CHANNELS; i++) { + ctx-temps[idx * TGSI_NUM_CHANNELS + i] = + lp_build_alloca(bld_base-base.gallivm, bld_base-base.vec_type, + temp); + } + } break; case TGSI_FILE_INPUT: @@ -284,6 +312,7 @@ emit_store( const struct tgsi_opcode_info * info, LLVMValueRef dst[4]) { + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); struct gallivm_state *gallivm = bld-bld_base.base.gallivm; struct lp_build_context base = bld-bld_base.base; @@ -359,7 +388,10 @@ emit_store( break; case TGSI_FILE_TEMPORARY: - temp_ptr = lp_get_temp_ptr_soa(bld, i + range.First, chan_index); + if (uses_temp_indirect_addressing(bld_base)) + temp_ptr = lp_get_temp_ptr_soa(bld, i + range.First, chan_index); + else + temp_ptr = ctx-temps[(i + range.First) * TGSI_NUM_CHANNELS + chan_index]; break; default: @@ -377,7 +409,9 @@ emit_store( break; case TGSI_FILE_TEMPORARY: - temp_ptr = lp_get_temp_ptr_soa(bld,
[Mesa-dev] [PATCH] r600/llvm: Allow arbitrary amount of temps in tgsi to llvm
--- src/gallium/drivers/radeon/radeon_llvm.h | 5 +++ .../drivers/radeon/radeon_setup_tgsi_llvm.c| 41 +++--- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index 2cab6b0..6d84f44 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -112,6 +112,11 @@ struct radeon_llvm_context { LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS]; unsigned output_reg_count; + /** +* @brief system_values +*/ + LLVMValueRef *temps; + unsigned temps_count; LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES]; /*=== Private Members ===*/ diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 3bb01ec..c897b03 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -184,7 +184,11 @@ emit_fetch( break; case TGSI_FILE_TEMPORARY: - ptr = lp_get_temp_ptr_soa(bld, reg-Register.Index, swizzle); + if (false) { + ptr = lp_get_temp_ptr_soa(bld, reg-Register.Index, swizzle); + break; + } + ptr = ctx-temps[reg-Register.Index * TGSI_NUM_CHANNELS + swizzle]; result = LLVMBuildLoad(builder, ptr, ); break; @@ -200,6 +204,13 @@ emit_fetch( return bitcast(bld_base, type, result); } +static bool uses_temp_indirect_addressing( + struct lp_build_tgsi_context *bld_base) +{ + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + return !(bld-indirect_files (1 TGSI_FILE_TEMPORARY)); +} + static LLVMValueRef fetch_system_value( struct lp_build_tgsi_context * bld_base, const struct tgsi_full_src_register *reg, @@ -234,7 +245,22 @@ static void emit_declaration( case TGSI_FILE_TEMPORARY: if (decl-Declaration.Array decl-Array.ArrayID = RADEON_LLVM_MAX_ARRAYS) ctx-arrays[decl-Array.ArrayID - 1] = decl-Range; - lp_emit_declaration_soa(bld_base, decl); + if (uses_temp_indirect_addressing(bld_base)) { + 
lp_emit_declaration_soa(bld_base, decl); + break; + } + unsigned first = decl-Range.First, last = decl-Range.Last; + if (!ctx-temps_count) { + ctx-temps_count = bld_base-info-file_max[TGSI_FILE_TEMPORARY] + 1; + ctx-temps = MALLOC(TGSI_NUM_CHANNELS * ctx-temps_count * sizeof(LLVMValueRef)); + } + for (unsigned idx = first; idx = last; idx++) { + for (unsigned i = 0; i TGSI_NUM_CHANNELS; i++) { + ctx-temps[idx * TGSI_NUM_CHANNELS + i] = + lp_build_alloca(bld_base-base.gallivm, bld_base-base.vec_type, + temp); + } + } break; case TGSI_FILE_INPUT: @@ -284,6 +310,7 @@ emit_store( const struct tgsi_opcode_info * info, LLVMValueRef dst[4]) { + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); struct gallivm_state *gallivm = bld-bld_base.base.gallivm; struct lp_build_context base = bld-bld_base.base; @@ -359,7 +386,10 @@ emit_store( break; case TGSI_FILE_TEMPORARY: - temp_ptr = lp_get_temp_ptr_soa(bld, i + range.First, chan_index); + if (uses_temp_indirect_addressing(bld_base)) + temp_ptr = lp_get_temp_ptr_soa(bld, i + range.First, chan_index); + else + temp_ptr = ctx-temps[(i + range.First) * TGSI_NUM_CHANNELS + chan_index]; break; default: @@ -377,7 +407,9 @@ emit_store( break; case TGSI_FILE_TEMPORARY: - temp_ptr = lp_get_temp_ptr_soa(bld, reg-Register.Index, chan_index); + if (uses_temp_indirect_addressing(bld_base)) + break; + temp_ptr = ctx-temps[ TGSI_NUM_CHANNELS * reg-Register.Index + chan_index]; break; default: @@ -1392,4 +1424,5 @@ void radeon_llvm_dispose(struct
Re: [Mesa-dev] [PATCH] R600: Make sure OQAP defs and uses happen in the same clause
This patch is: reviewed-by: Vincent Lejeune vljn at ovi.com - Mail original - De : Tom Stellard t...@stellard.net À : Vincent Lejeune v...@ovi.com Cc : mesa-dev@lists.freedesktop.org mesa-dev@lists.freedesktop.org; llvm-comm...@cs.uiuc.edu llvm-comm...@cs.uiuc.edu; Tom Stellard thomas.stell...@amd.com Envoyé le : Jeudi 14 novembre 2013 1h53 Objet : Re: [PATCH] R600: Make sure OQAP defs and uses happen in the same clause Hi Vincent, I discovered a bug in the previous patch. Here is an updated version. -Tom On Tue, Nov 12, 2013 at 03:01:42PM -0800, Tom Stellard wrote: Hi Vincent, Here is an updated patch where I added a call to SubstituteKCacheBank() in canClauseLocalKillFitInClause(). This should prevent OQAP uses and defs from being split because of constant bank limitations. Maybe we can leave the ScheduleDAGMutation optimization as a future TODO. -Tom On Sun, Nov 03, 2013 at 10:19:16AM -0800, Vincent Lejeune wrote: I have put some comments below but otherwise the patch is reviewed-by: Vincent Lejeune vljn at ovi.com -- next part -- From 2eb4673e3184af0e077cbe30a594602441e8d98e Mon Sep 17 00:00:00 2001 From: Tom Stellard thomas.stellard at amd.com Date: Thu, 5 Sep 2013 08:59:32 -0700 Subject: [PATCH] R600: Fix scheduling of instructions that use the LDS output queue The LDS output queue is accessed via the OQAP register. The OQAP register cannot be live across clauses, so if a value is written to the output queue, it must be retrieved before the end of the clause. With the machine scheduler, we cannot satisfy this constraint, because it lacks proper alias analysis and it will mark some LDS accesses as having a chain dependency on vertex fetches. Since vertex fetches We can customize the dependency graph before machine scheduling takes place, using ScheduleDAGMutation. 
I already wrote some code to break artificial dependencies between vector subregister read/write here: http://cgit.freedesktop.org/~vlj/llvm/commit/?h=vliw5&id=e91b16a22845d0a80ed348f158ae7ab293e003a8 While I expect Matthias Braun's subregister patches, once upstreamed, to make most of this patch obsolete (except the tests), it can be reworked so that it parses all MEM dependencies and removes the ones between instructions touching different memory pools (like VTX_FETCH and LDS_READ). require a new clause, the dependency may end up splitting OQAP uses and defs so they end up in different clauses. See the lds-output-queue.ll test for a more detailed explanation. To work around this issue, we now combine the LDS read and the OQAP copy into one instruction and expand it after register allocation. This patch also adds some checks to the EmitClauseMarker pass, so that it doesn't end a clause with a value still in the output queue and removes AR.X and OQAP handling from the scheduler (AR.X uses and defs were already being expanded post-RA, so the scheduler will never see them). 
--- lib/Target/R600/R600EmitClauseMarkers.cpp | 52 ++ lib/Target/R600/R600ExpandSpecialInstrs.cpp | 17 + lib/Target/R600/R600ISelLowering.cpp | 20 +++--- lib/Target/R600/R600InstrInfo.cpp | 8 +++ lib/Target/R600/R600InstrInfo.h | 2 + lib/Target/R600/R600MachineScheduler.cpp | 32 - lib/Target/R600/R600MachineScheduler.h | 2 - lib/Target/R600/R600RegisterInfo.cpp | 13 lib/Target/R600/R600RegisterInfo.h | 2 + test/CodeGen/R600/lds-output-queue.ll | 99 +++ test/CodeGen/R600/local-memory-two-objects.ll | 8 ++- 11 files changed, 206 insertions(+), 49 deletions(-) create mode 100644 test/CodeGen/R600/lds-output-queue.ll diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp + bool canClauseLocalKillFitInClause( + unsigned AluInstCount, + MachineBasicBlock::iterator Def, + MachineBasicBlock::iterator BBEnd) { + const R600RegisterInfo TRI = TII-getRegisterInfo(); + for (MachineInstr::const_mop_iterator + MOI = Def-operands_begin(), + MOE = Def-operands_end(); MOI != MOE; ++MOI) { + if (!MOI-isReg() || !MOI-isDef() || + TRI.isPhysRegLiveAcrossClauses(MOI-getReg())) + continue; + + // Def defines a clause local register, so check that its use will fit + // in the clause. + unsigned LastUseCount = 0; + for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { + AluInstCount += OccupiedDwords(UseI); + // We have reached the maximum instruction limit before
[Mesa-dev] [PATCH] r600/llvm: Store inputs in function arguments
--- src/gallium/drivers/r600/r600_llvm.c | 119 +++ src/gallium/drivers/r600/r600_shader.c | 1 + src/gallium/drivers/radeon/radeon_llvm.h | 1 + 3 files changed, 121 insertions(+) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 5afe3cb..a2ff0ec 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -77,6 +77,11 @@ static void llvm_load_system_value( default: assert(!unknown system value); } +#if HAVE_LLVM = 0x0304 + ctx-system_values[index] = LLVMBuildExtractElement(ctx-gallivm.builder, + LLVMGetParam(ctx-main_fn, 0), lp_build_const_int32((ctx-gallivm), chan), + ); +#else LLVMValueRef reg = lp_build_const_int32( ctx-soa.bld_base.base.gallivm, chan); ctx-system_values[index] = build_intrinsic( @@ -84,8 +89,49 @@ static void llvm_load_system_value( llvm.R600.load.input, ctx-soa.bld_base.base.elem_type, reg, 1, LLVMReadNoneAttribute); +#endif } +#if HAVE_LLVM = 0x0304 +static LLVMValueRef +llvm_load_input_vector( + struct radeon_llvm_context * ctx, unsigned location, unsigned ijregs, + boolean interp) +{ + LLVMTypeRef VecType; + LLVMValueRef Args[3] = { + lp_build_const_int32((ctx-gallivm), location) + }; + unsigned ArgCount = 1; + if (interp) { + VecType = LLVMVectorType(ctx-soa.bld_base.base.elem_type, 2); + LLVMValueRef IJIndex = LLVMGetParam(ctx-main_fn, ijregs / 2); + Args[ArgCount++] = LLVMBuildExtractElement(ctx-gallivm.builder, IJIndex, + lp_build_const_int32((ctx-gallivm), 2 * (ijregs % 2)), ); + Args[ArgCount++] = LLVMBuildExtractElement(ctx-gallivm.builder, IJIndex, + lp_build_const_int32((ctx-gallivm), 2 * (ijregs % 2) + 1), ); + LLVMValueRef HalfVec[2] = { + build_intrinsic(ctx-gallivm.builder, llvm.R600.interp.xy, + VecType, Args, ArgCount, LLVMReadNoneAttribute), + build_intrinsic(ctx-gallivm.builder, llvm.R600.interp.zw, + VecType, Args, ArgCount, LLVMReadNoneAttribute) + }; + LLVMValueRef MaskInputs[4] = { + lp_build_const_int32((ctx-gallivm), 0), + 
lp_build_const_int32((ctx-gallivm), 1), + lp_build_const_int32((ctx-gallivm), 2), + lp_build_const_int32((ctx-gallivm), 3) + }; + LLVMValueRef Mask = LLVMConstVector(MaskInputs, 4); + return LLVMBuildShuffleVector(ctx-gallivm.builder, HalfVec[0], HalfVec[1], + Mask, ); + } else { + VecType = LLVMVectorType(ctx-soa.bld_base.base.elem_type, 4); + return build_intrinsic(ctx-gallivm.builder, llvm.R600.interp.const, + VecType, Args, ArgCount, LLVMReadNoneAttribute); + } +} +#else static LLVMValueRef llvm_load_input_helper( struct radeon_llvm_context * ctx, @@ -110,7 +156,22 @@ llvm_load_input_helper( return build_intrinsic(bb-gallivm-builder, intrinsic, bb-elem_type, arg[0], arg_count, LLVMReadNoneAttribute); } +#endif +#if HAVE_LLVM = 0x0304 +static LLVMValueRef +llvm_face_select_helper( + struct radeon_llvm_context * ctx, + LLVMValueRef face, LLVMValueRef front_color, LLVMValueRef back_color) +{ + const struct lp_build_context * bb = ctx-soa.bld_base.base; + LLVMValueRef is_front = LLVMBuildFCmp( + bb-gallivm-builder, LLVMRealUGT, face, + lp_build_const_float(bb-gallivm, 0.0f),); + return LLVMBuildSelect(bb-gallivm-builder, is_front, + front_color, back_color, ); +} +#else static LLVMValueRef llvm_face_select_helper( struct radeon_llvm_context * ctx, @@ -124,6 +185,7 @@ llvm_face_select_helper( return LLVMBuildSelect(bb-gallivm-builder, is_front, front_color, back_color, ); } +#endif static void llvm_load_input( struct radeon_llvm_context * ctx, @@ -132,11 +194,55 @@ static void llvm_load_input( { const struct r600_shader_io * input = ctx-r600_inputs[input_index]; unsigned chan; +#if HAVE_LLVM 0x0304 unsigned interp = 0; int ij_index; +#endif int two_side = (ctx-two_side input-name == TGSI_SEMANTIC_COLOR); LLVMValueRef v; +#if HAVE_LLVM = 0x0304 + boolean
[Mesa-dev] [PATCH] r600/llvm: Store inputs in function arguments
--- src/gallium/drivers/r600/r600_llvm.c | 125 - src/gallium/drivers/r600/r600_shader.c | 2 + src/gallium/drivers/radeon/radeon_llvm.h | 1 + .../drivers/radeon/radeon_setup_tgsi_llvm.c| 2 +- 4 files changed, 75 insertions(+), 55 deletions(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 5afe3cb..8dcda1a 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -87,37 +87,50 @@ static void llvm_load_system_value( } static LLVMValueRef -llvm_load_input_helper( - struct radeon_llvm_context * ctx, - unsigned idx, int interp, int ij_index) +llvm_load_input_vector( + struct radeon_llvm_context * ctx, unsigned location, unsigned ijregs, + boolean interp) { - const struct lp_build_context * bb = ctx-soa.bld_base.base; - LLVMValueRef arg[2]; - int arg_count; - const char * intrinsic; - - arg[0] = lp_build_const_int32(bb-gallivm, idx); - - if (interp) { - intrinsic = llvm.R600.interp.input; - arg[1] = lp_build_const_int32(bb-gallivm, ij_index); - arg_count = 2; - } else { - intrinsic = llvm.R600.load.input; - arg_count = 1; - } - - return build_intrinsic(bb-gallivm-builder, intrinsic, - bb-elem_type, arg[0], arg_count, LLVMReadNoneAttribute); + LLVMTypeRef VecType; + LLVMValueRef Args[2] = { + lp_build_const_int32((ctx-gallivm), location) + }; + unsigned ArgCount = 1; + if (interp) { + VecType = LLVMVectorType(ctx-soa.bld_base.base.elem_type, 2); + LLVMValueRef IJIndex = LLVMGetParam(ctx-main_fn, ijregs / 2); + Args[ArgCount++] = LLVMBuildExtractElement(ctx-gallivm.builder, IJIndex, + lp_build_const_int32((ctx-gallivm), 2 * (ijregs % 2)), ); + Args[ArgCount++] = LLVMBuildExtractElement(ctx-gallivm.builder, IJIndex, + lp_build_const_int32((ctx-gallivm), 2 * (ijregs % 2) + 1), ); + LLVMValueRef HalfVec[2] = { + build_intrinsic(ctx-gallivm.builder, llvm.R600.interp.xy, + VecType, Args, ArgCount, LLVMReadNoneAttribute), + build_intrinsic(ctx-gallivm.builder, llvm.R600.interp.zw, + VecType, 
Args, ArgCount, LLVMReadNoneAttribute) + }; + LLVMValueRef MaskInputs[4] = { + lp_build_const_int32((ctx-gallivm), 0), + lp_build_const_int32((ctx-gallivm), 1), + lp_build_const_int32((ctx-gallivm), 2), + lp_build_const_int32((ctx-gallivm), 3) + }; + LLVMValueRef Mask = LLVMConstVector(MaskInputs, 4); + return LLVMBuildShuffleVector(ctx-gallivm.builder, HalfVec[0], HalfVec[1], + Mask, ); + } else { + VecType = LLVMVectorType(ctx-soa.bld_base.base.elem_type, 4); + return build_intrinsic(ctx-gallivm.builder, llvm.R600.interp.const, + VecType, Args, ArgCount, LLVMReadNoneAttribute); + } } static LLVMValueRef llvm_face_select_helper( struct radeon_llvm_context * ctx, - unsigned face_loc, LLVMValueRef front_color, LLVMValueRef back_color) + LLVMValueRef face, LLVMValueRef front_color, LLVMValueRef back_color) { const struct lp_build_context * bb = ctx-soa.bld_base.base; - LLVMValueRef face = llvm_load_input_helper(ctx, face_loc, 0, 0); LLVMValueRef is_front = LLVMBuildFCmp( bb-gallivm-builder, LLVMRealUGT, face, lp_build_const_float(bb-gallivm, 0.0f),); @@ -132,50 +145,46 @@ static void llvm_load_input( { const struct r600_shader_io * input = ctx-r600_inputs[input_index]; unsigned chan; - unsigned interp = 0; - int ij_index; int two_side = (ctx-two_side input-name == TGSI_SEMANTIC_COLOR); LLVMValueRef v; + boolean require_interp_intrinsic = ctx-chip_class = EVERGREEN + ctx-type == TGSI_PROCESSOR_FRAGMENT; - if (ctx-chip_class = EVERGREEN ctx-type == TGSI_PROCESSOR_FRAGMENT - input-spi_sid) { - interp = 1; - ij_index = (input-interpolate 0) ? input-ij_index : -1; - } + if (require_interp_intrinsic input-spi_sid) { + v = llvm_load_input_vector(ctx, input-lds_pos, input-ij_index, + (input-interpolate 0)); +
[Mesa-dev] [PATCH] R600: Use function inputs to represent data stored in gpr
--- lib/Target/R600/AMDGPUCallingConv.td| 15 - lib/Target/R600/R600ISelLowering.cpp| 55 ++-- lib/Target/R600/R600Instructions.td | 2 +- lib/Target/R600/R600Intrinsics.td | 8 ++- test/CodeGen/R600/big_alu.ll| 85 - test/CodeGen/R600/complex-folding.ll| 9 +-- test/CodeGen/R600/floor.ll | 14 ++-- test/CodeGen/R600/fmad.ll | 20 +++--- test/CodeGen/R600/fmax.ll | 13 ++-- test/CodeGen/R600/fmin.ll | 13 ++-- test/CodeGen/R600/llvm.AMDGPU.mul.ll| 16 ++--- test/CodeGen/R600/llvm.cos.ll | 12 ++-- test/CodeGen/R600/llvm.pow.ll | 16 ++--- test/CodeGen/R600/llvm.sin.ll | 12 ++-- test/CodeGen/R600/load-input-fold.ll| 29 - test/CodeGen/R600/max-literals.ll | 25 test/CodeGen/R600/pv-packing.ll | 25 +++- test/CodeGen/R600/pv.ll | 61 +- test/CodeGen/R600/r600-encoding.ll | 15 +++-- test/CodeGen/R600/r600-export-fix.ll| 14 ++-- test/CodeGen/R600/r600cfg.ll| 14 ++-- test/CodeGen/R600/reciprocal.ll | 13 ++-- test/CodeGen/R600/rv7x0_count3.ll | 19 +++--- test/CodeGen/R600/schedule-fs-loop-nested-if.ll | 13 ++-- test/CodeGen/R600/schedule-vs-if-nested-loop.ll | 14 ++-- test/CodeGen/R600/shared-op-cycle.ll| 16 ++--- test/CodeGen/R600/swizzle-export.ll | 32 -- test/CodeGen/R600/tex-clause-antidep.ll | 13 ++-- test/CodeGen/R600/texture-input-merge.ll| 13 ++-- 29 files changed, 285 insertions(+), 321 deletions(-) diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index a194e6d..bb7d6f8 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -42,6 +42,17 @@ def CC_SI : CallingConv[ ]; +// Calling convention for R600 +def CC_R600 : CallingConv[ + CCIfInRegCCIfType[v4f32, v4i32] , CCAssignToReg[ +T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW, +T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW, +T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW, +T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW, +T30_XYZW, T31_XYZW, T32_XYZW + ] 
+]; + // Calling convention for compute kernels def CC_AMDGPU_Kernel : CallingConv[ CCIfType[v4i32, v4f32], CCAssignToStack 16, 16, @@ -61,5 +72,7 @@ def CC_AMDGPU : CallingConv[ State.getMachineFunction().getInfoR600MachineFunctionInfo()- ShaderType == ShaderType::COMPUTE, CCDelegateToCC_AMDGPU_Kernel, CCIfState.getTarget().getSubtargetAMDGPUSubtarget()# - .getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS, CCDelegateToCC_SI + .getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS, CCDelegateToCC_SI, + CCIfState.getTarget().getSubtargetAMDGPUSubtarget()# + .getGeneration() AMDGPUSubtarget::SOUTHERN_ISLANDS, CCDelegateToCC_R600 ]; diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 3c2e388..deab985 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -554,51 +554,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const SDLoc DL(Op); switch(IntrinsicID) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); -case AMDGPUIntrinsic::R600_load_input: { - int64_t RegIndex = castConstantSDNode(Op.getOperand(1))-getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MachineFunction MF = DAG.getMachineFunction(); - MachineRegisterInfo MRI = MF.getRegInfo(); - MRI.addLiveIn(Reg); - return DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), Reg, VT); -} - -case AMDGPUIntrinsic::R600_interp_input: { +case AMDGPUIntrinsic::R600_interp_xy: +case AMDGPUIntrinsic::R600_interp_zw: { int slot = castConstantSDNode(Op.getOperand(1))-getZExtValue(); - int ijb = castConstantSDNode(Op.getOperand(2))-getSExtValue(); MachineSDNode *interp; - if (ijb 0) { -const MachineFunction MF = DAG.getMachineFunction(); -const R600InstrInfo *TII = - static_castconst R600InstrInfo*(MF.getTarget().getInstrInfo()); -interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, -MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32)); 
-return DAG.getTargetExtractSubreg( -TII-getRegisterInfo().getSubRegFromChannel(slot % 4), -DL, MVT::f32, SDValue(interp, 0)); - } + SDValue RegisterINode = Op.getOperand(2); + SDValue
Re: [Mesa-dev] [PATCH] R600: Make sure OQAP defs and uses happen in the same clause
I have put some comments below but otherwise the patch is reviewed-by: Vincent Lejeune vljn at ovi.com -- next part -- From 2eb4673e3184af0e077cbe30a594602441e8d98e Mon Sep 17 00:00:00 2001 From: Tom Stellard thomas.stellard at amd.com Date: Thu, 5 Sep 2013 08:59:32 -0700 Subject: [PATCH] R600: Fix scheduling of instructions that use the LDS output queue The LDS output queue is accessed via the OQAP register. The OQAP register cannot be live across clauses, so if a value is written to the output queue, it must be retrieved before the end of the clause. With the machine scheduler, we cannot satisfy this constraint, because it lacks proper alias analysis and it will mark some LDS accesses as having a chain dependency on vertex fetches. Since vertex fetches We can customize the dependency graph before machine scheduling takes place, using ScheduleDAGMutation. I already wrote some code to break artificial dependencies between vector subregister read/write here: http://cgit.freedesktop.org/~vlj/llvm/commit/?h=vliw5&id=e91b16a22845d0a80ed348f158ae7ab293e003a8 While I expect Matthias Braun's subregister patches, once upstreamed, to make most of this patch obsolete (except the tests), it can be reworked so that it parses all MEM dependencies and removes the ones between instructions touching different memory pools (like VTX_FETCH and LDS_READ). require a new clause, the dependency may end up splitting OQAP uses and defs so they end up in different clauses. See the lds-output-queue.ll test for a more detailed explanation. To work around this issue, we now combine the LDS read and the OQAP copy into one instruction and expand it after register allocation. This patch also adds some checks to the EmitClauseMarker pass, so that it doesn't end a clause with a value still in the output queue and removes AR.X and OQAP handling from the scheduler (AR.X uses and defs were already being expanded post-RA, so the scheduler will never see them). 
--- lib/Target/R600/R600EmitClauseMarkers.cpp | 52 ++ lib/Target/R600/R600ExpandSpecialInstrs.cpp | 17 + lib/Target/R600/R600ISelLowering.cpp | 20 +++--- lib/Target/R600/R600InstrInfo.cpp | 8 +++ lib/Target/R600/R600InstrInfo.h | 2 + lib/Target/R600/R600MachineScheduler.cpp | 32 - lib/Target/R600/R600MachineScheduler.h| 2 - lib/Target/R600/R600RegisterInfo.cpp | 13 lib/Target/R600/R600RegisterInfo.h| 2 + test/CodeGen/R600/lds-output-queue.ll | 99 +++ test/CodeGen/R600/local-memory-two-objects.ll | 8 ++- 11 files changed, 206 insertions(+), 49 deletions(-) create mode 100644 test/CodeGen/R600/lds-output-queue.ll diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp + bool canClauseLocalKillFitInClause( + unsigned AluInstCount, + MachineBasicBlock::iterator Def, + MachineBasicBlock::iterator BBEnd) { +const R600RegisterInfo TRI = TII-getRegisterInfo(); +for (MachineInstr::const_mop_iterator + MOI = Def-operands_begin(), + MOE = Def-operands_end(); MOI != MOE; ++MOI) { + if (!MOI-isReg() || !MOI-isDef() || + TRI.isPhysRegLiveAcrossClauses(MOI-getReg())) +continue; + + // Def defines a clause local register, so check that its use will fit + // in the clause. + unsigned LastUseCount = 0; + for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { +AluInstCount += OccupiedDwords(UseI); +// We have reached the maximum instruction limit before finding the +// use that kills this register, so we cannot use this def in the +// current clause. +if (AluInstCount = TII-getMaxAlusPerClause()) + return false; + +// Register kill flags have been cleared by the time we get to this +// pass, but it is safe to assume that all uses of this register +// occur in the same basic block as its definition, because +// it is illegal for the scheduler to schedule them in +// different blocks. 
+if (UseI-findRegisterUseOperandIdx(MOI-getReg())) + LastUseCount = AluInstCount; + +if (UseI != Def UseI-findRegisterDefOperandIdx(MOI-getReg()) != -1) + break; + } + if (LastUseCount) +return LastUseCount = TII-getMaxAlusPerClause(); + llvm_unreachable(Clause local register live at end of clause.); +} +return true; + } This function does not check if current clause can hold all constant bank. I think it's likely to be rare for a clause to be split because of constant bank limitations, but it would be better to have an assertion failure in such case to make debugging easier. For instance if the SubstituteKCacheBank return false, you can check that there is no lds use
[Mesa-dev] [PATCH 1/2] r600/llvm: Fix texbuf for pre EG gen
R600/R700 implementation of tex buffer fetch requires the result of the VFETCH instruction to be ANDed with R600_BUFFER_INFO_CONST_BUFFER, and the last channel to be ORed with the same const buffer. --- src/gallium/drivers/r600/r600_llvm.c | 29 + 1 file changed, 29 insertions(+) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 34dd3ad..d7fa5f8 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -427,6 +427,35 @@ static void llvm_emit_tex( emit_data-output[0] = build_intrinsic(gallivm-builder, llvm.R600.load.texbuf, emit_data-dst_type, args, 2, LLVMReadNoneAttribute); + if (ctx-chip_class = EVERGREEN) + return; + ctx-uses_tex_buffers = true; + LLVMDumpValue(emit_data-output[0]); + emit_data-output[0] = LLVMBuildBitCast(gallivm-builder, + emit_data-output[0], LLVMVectorType(bld_base-base.int_elem_type, 4), + ); + LLVMValueRef Mask = llvm_load_const_buffer(bld_base, + lp_build_const_int32(gallivm, 0), + LLVM_R600_BUFFER_INFO_CONST_BUFFER); + Mask = LLVMBuildBitCast(gallivm-builder, Mask, + LLVMVectorType(bld_base-base.int_elem_type, 4), ); + emit_data-output[0] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_AND, + emit_data-output[0], + Mask); + LLVMValueRef WComponent = LLVMBuildExtractElement(gallivm-builder, + emit_data-output[0], lp_build_const_int32(gallivm, 3), ); + Mask = llvm_load_const_buffer(bld_base, lp_build_const_int32(gallivm, 1), + LLVM_R600_BUFFER_INFO_CONST_BUFFER); + Mask = LLVMBuildExtractElement(gallivm-builder, Mask, + lp_build_const_int32(gallivm, 0), ); + Mask = LLVMBuildBitCast(gallivm-builder, Mask, + bld_base-base.int_elem_type, ); + WComponent = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_OR, + WComponent, Mask); + emit_data-output[0] = LLVMBuildInsertElement(gallivm-builder, + emit_data-output[0], WComponent, lp_build_const_int32(gallivm, 3), ); + emit_data-output[0] = LLVMBuildBitCast(gallivm-builder, + emit_data-output[0], 
LLVMVectorType(bld_base-base.elem_type, 4), ); } return; default: -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] r600/llvm: Fix isampleBuffer on preEG
On R600/R700 hw the data are stored from the channel 2 of the second dword. --- src/gallium/drivers/r600/r600_llvm.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index d7fa5f8..5afe3cb 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -415,9 +415,22 @@ static void llvm_emit_tex( case TGSI_OPCODE_TXQ: { struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); ctx-uses_tex_buffers = true; - LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, 0); + bool isEgPlus = (ctx-chip_class = EVERGREEN); + LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, + isEgPlus ? 0 : 1); LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, LLVM_R600_BUFFER_INFO_CONST_BUFFER); + if (!isEgPlus) { + LLVMValueRef maskval[4] = { + lp_build_const_int32(gallivm, 1), + lp_build_const_int32(gallivm, 2), + lp_build_const_int32(gallivm, 3), + lp_build_const_int32(gallivm, 0), + }; + LLVMValueRef mask = LLVMConstVector(maskval, 4); + cvecval = LLVMBuildShuffleVector(gallivm-builder, cvecval, cvecval, + mask, ); + } emit_data-output[0] = cvecval; return; } -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] R600: Make sure OQAP defs and uses happen in the same clause
This patch should work when checking that no OQAP is used before being queued, assuming that a value in OQAP is consumed and cannot be read twice. However, I'm not sure I cover all LDS instructions that queue a value; I only use LDS_RET_READ in the switch case. Vincent - Mail original - De : Tom Stellard t...@stellard.net À : Vincent Lejeune v...@ovi.com Cc : llvm-comm...@cs.uiuc.edu llvm-comm...@cs.uiuc.edu; mesa-dev@lists.freedesktop.org mesa-dev@lists.freedesktop.org; Tom Stellard thomas.stell...@amd.com Envoyé le : Mardi 22 octobre 2013 23h20 Objet : Re: [PATCH] R600: Make sure OQAP defs and uses happen in the same clause Hi Vincent, Here is an updated patch. I wasn't sure where to put the assertion to check that UnscheduledNoLiveOut{Defs,Uses} is empty when switching to a new clause. I tried adding it to R600SchedStrategy::schedNode() behind the if (NextInstKind != CurInstKind) condition, but it always failed. Any suggestions on where I should put it? -Tom On Mon, Oct 21, 2013 at 12:40:28PM -0700, Vincent Lejeune wrote: - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org; Tom Stellard thomas.stell...@amd.com Envoyé le : Vendredi 11 octobre 2013 20h10 Objet : [PATCH] R600: Make sure OQAP defs and uses happen in the same clause From: Tom Stellard thomas.stell...@amd.com Reading the special OQAP register pops the top value off the LDS input queue and returns it to the instruction. This queue is invalidated at the end of an ALU clause and leaving values in the queue can lead to GPU hangs. This means that if we load a value into the queue, we must use it before the end of the clause. This fixes some hangs in the OpenCV test suite. 
--- lib/Target/R600/R600MachineScheduler.cpp | 25 + lib/Target/R600/R600MachineScheduler.h | 4 ++-- test/CodeGen/R600/lds-input-queue.ll | 26 ++ 3 files changed, 41 insertions(+), 14 deletions(-) create mode 100644 test/CodeGen/R600/lds-input-queue.ll diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index 6c26d9e..611b7f4 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -93,11 +93,12 @@ SUnit* R600SchedStrategy::pickNode(bool IsTopNode) { } - // We want to scheduled AR defs as soon as possible to make sure they aren't - // put in a different ALU clause from their uses. - if (!SU !UnscheduledARDefs.empty()) { - SU = UnscheduledARDefs[0]; - UnscheduledARDefs.erase(UnscheduledARDefs.begin()); + // We want to scheduled defs that cannot be live outside of this clause + // as soon as possible to make sure they aren't put in a different + // ALU clause from their uses. + if (!SU !UnscheduledNoLiveOutDefs.empty()) { + SU = UnscheduledNoLiveOutDefs[0]; + UnscheduledNoLiveOutDefs.erase(UnscheduledNoLiveOutDefs.begin()); NextInstKind = IDAlu; } @@ -132,9 +133,9 @@ SUnit* R600SchedStrategy::pickNode(bool IsTopNode) { // We want to schedule the AR uses as late as possible to make sure that // the AR defs have been released. - if (!SU !UnscheduledARUses.empty()) { - SU = UnscheduledARUses[0]; - UnscheduledARUses.erase(UnscheduledARUses.begin()); + if (!SU !UnscheduledNoLiveOutUses.empty()) { + SU = UnscheduledNoLiveOutUses[0]; + UnscheduledNoLiveOutUses.erase(UnscheduledNoLiveOutUses.begin()); Can we use std::queueSUnit* instead of a std::vector for UnscheduledNoLiveOutUses ? I had to use a vector because I needed to be able to pop non topmost SUnit in some case (to fit Instruction Group const read limitation) but I would rather avoid erase(iterator) call when possible. 
NextInstKind = IDAlu; } @@ -217,15 +218,15 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) { int IK = getInstKind(SU); - // Check for AR register defines + // Check for registers that do not live across ALU clauses. for (MachineInstr::const_mop_iterator I = SU-getInstr()-operands_begin(), E = SU-getInstr()-operands_end(); I != E; ++I) { - if (I-isReg() I-getReg() == AMDGPU::AR_X) { + if (I-isReg() (I-getReg() == AMDGPU::AR_X || I-getReg() == AMDGPU::OQAP)) { if (I-isDef()) { - UnscheduledARDefs.push_back(SU); + UnscheduledNoLiveOutDefs.push_back(SU); } else { - UnscheduledARUses.push_back(SU); + UnscheduledNoLiveOutUses.push_back(SU); } return; } diff --git a/lib/Target/R600
[Mesa-dev] [PATCH 1/2] r600/llvm: Fix texbuf for pre EG gen
--- src/gallium/drivers/r600/r600_llvm.c | 29 + 1 file changed, 29 insertions(+) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 34dd3ad..d7fa5f8 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -427,6 +427,35 @@ static void llvm_emit_tex( emit_data-output[0] = build_intrinsic(gallivm-builder, llvm.R600.load.texbuf, emit_data-dst_type, args, 2, LLVMReadNoneAttribute); + if (ctx-chip_class = EVERGREEN) + return; + ctx-uses_tex_buffers = true; + LLVMDumpValue(emit_data-output[0]); + emit_data-output[0] = LLVMBuildBitCast(gallivm-builder, + emit_data-output[0], LLVMVectorType(bld_base-base.int_elem_type, 4), + ); + LLVMValueRef Mask = llvm_load_const_buffer(bld_base, + lp_build_const_int32(gallivm, 0), + LLVM_R600_BUFFER_INFO_CONST_BUFFER); + Mask = LLVMBuildBitCast(gallivm-builder, Mask, + LLVMVectorType(bld_base-base.int_elem_type, 4), ); + emit_data-output[0] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_AND, + emit_data-output[0], + Mask); + LLVMValueRef WComponent = LLVMBuildExtractElement(gallivm-builder, + emit_data-output[0], lp_build_const_int32(gallivm, 3), ); + Mask = llvm_load_const_buffer(bld_base, lp_build_const_int32(gallivm, 1), + LLVM_R600_BUFFER_INFO_CONST_BUFFER); + Mask = LLVMBuildExtractElement(gallivm-builder, Mask, + lp_build_const_int32(gallivm, 0), ); + Mask = LLVMBuildBitCast(gallivm-builder, Mask, + bld_base-base.int_elem_type, ); + WComponent = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_OR, + WComponent, Mask); + emit_data-output[0] = LLVMBuildInsertElement(gallivm-builder, + emit_data-output[0], WComponent, lp_build_const_int32(gallivm, 3), ); + emit_data-output[0] = LLVMBuildBitCast(gallivm-builder, + emit_data-output[0], LLVMVectorType(bld_base-base.elem_type, 4), ); } return; default: -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] r600/llvm: Fix isampleBuffer on preEG
--- src/gallium/drivers/r600/r600_llvm.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index d7fa5f8..5afe3cb 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -415,9 +415,22 @@ static void llvm_emit_tex( case TGSI_OPCODE_TXQ: { struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); ctx-uses_tex_buffers = true; - LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, 0); + bool isEgPlus = (ctx-chip_class = EVERGREEN); + LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, + isEgPlus ? 0 : 1); LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, LLVM_R600_BUFFER_INFO_CONST_BUFFER); + if (!isEgPlus) { + LLVMValueRef maskval[4] = { + lp_build_const_int32(gallivm, 1), + lp_build_const_int32(gallivm, 2), + lp_build_const_int32(gallivm, 3), + lp_build_const_int32(gallivm, 0), + }; + LLVMValueRef mask = LLVMConstVector(maskval, 4); + cvecval = LLVMBuildShuffleVector(gallivm-builder, cvecval, cvecval, + mask, ); + } emit_data-output[0] = cvecval; return; } -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] radeonsi: Do not set both inreg and byval
--- src/gallium/drivers/radeonsi/radeonsi_shader.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c index ab996cc..209b77e 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_shader.c +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c @@ -1655,16 +1655,19 @@ static void create_function(struct si_shader_context *si_shader_ctx) for (i = 0; i = last_sgpr; ++i) { LLVMValueRef P = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, i); - LLVMAddAttribute(P, LLVMInRegAttribute); + switch (i) { + default: + LLVMAddAttribute(P, LLVMInRegAttribute); + break; +#if HAVE_LLVM = 0x0304 /* We tell llvm that array inputs are passed by value to allow Sinking pass * to move load. Inputs are constant so this is fine. */ - switch (i) { - default: break; case SI_PARAM_CONST: case SI_PARAM_SAMPLER: case SI_PARAM_RESOURCE: LLVMAddAttribute(P, LLVMByValAttribute); break; +#endif } } -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] R600/SI: Support byval arguments
LLVM does not allow a function argument to be flagged byval and inreg at the same time. It currently works because we don't verify our module in Mesa, as our byval inreg arguments are considered byval by the sinking pass, and inreg by the ISel pass. The patch: http://lists.freedesktop.org/archives/mesa-dev/2013-October/046022.html fixes the situation but requires the backend to provide a way to lower byval arguments. This patch provides such support. Vincent De : Tom Stellard t...@stellard.net À : Vincent Lejeune v...@ovi.com Cc : mesa-dev@lists.freedesktop.org Envoyé le : Jeudi 10 octobre 2013 15h19 Objet : Re: [Mesa-dev] [PATCH] R600/SI: Support byval arguments On Thu, Oct 10, 2013 at 12:04:16AM +0200, Vincent Lejeune wrote: What is the purpose of this change? -Tom --- lib/Target/R600/AMDGPUCallingConv.td | 7 ++- lib/Target/R600/SIISelLowering.cpp | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index d26be32..a194e6d 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -33,7 +33,12 @@ def CC_SI : CallingConv[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 - ] + ], + + CCIfByValCCIfType[i64] , CCAssignToRegWithShadow + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + ]; diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 2174753..cd18154 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -158,7 +158,8 @@ SDValue SITargetLowering::LowerFormalArguments( const ISD::InputArg Arg = Ins[i]; // First check if it's a PS input addr - if (Info-ShaderType == ShaderType::PIXEL !Arg.Flags.isInReg()) { + if (Info-ShaderType == ShaderType::PIXEL 
!Arg.Flags.isInReg() + !Arg.Flags.isByVal()) { assert((PSInputNum = 15) Too many PS inputs!); -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600/SI: Support byval arguments
--- lib/Target/R600/AMDGPUCallingConv.td | 7 ++- lib/Target/R600/SIISelLowering.cpp | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index d26be32..a194e6d 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -33,7 +33,12 @@ def CC_SI : CallingConv[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 - ] + ], + + CCIfByValCCIfType[i64] , CCAssignToRegWithShadow +[ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], +[ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + ]; diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 2174753..cd18154 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -158,7 +158,8 @@ SDValue SITargetLowering::LowerFormalArguments( const ISD::InputArg Arg = Ins[i]; // First check if it's a PS input addr -if (Info-ShaderType == ShaderType::PIXEL !Arg.Flags.isInReg()) { +if (Info-ShaderType == ShaderType::PIXEL !Arg.Flags.isInReg() +!Arg.Flags.isByVal()) { assert((PSInputNum = 15) Too many PS inputs!); -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] radeonsi: Do not set both inreg and byval
--- src/gallium/drivers/radeonsi/radeonsi_shader.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c index ab996cc..9d95997 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_shader.c +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c @@ -1655,11 +1655,12 @@ static void create_function(struct si_shader_context *si_shader_ctx) for (i = 0; i = last_sgpr; ++i) { LLVMValueRef P = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, i); - LLVMAddAttribute(P, LLVMInRegAttribute); + switch (i) { + default: + LLVMAddAttribute(P, LLVMInRegAttribute); + break; /* We tell llvm that array inputs are passed by value to allow Sinking pass * to move load. Inputs are constant so this is fine. */ - switch (i) { - default: break; case SI_PARAM_CONST: case SI_PARAM_SAMPLER: case SI_PARAM_RESOURCE: -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] PATCH: R600/SI: Enable the verifier on most lit tests
3rd patch is reviewed-by: Vincent Lejeune vljn at ovi.com The first one: Subject: [PATCH 1/4] R600/SI: Mark the EXEC register as reserved This prevents the machine verifier from complaining about uses of an undefined physical register. --- lib/Target/R600/SIRegisterInfo.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index 5d12564..279ff33 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -25,7 +25,8 @@ SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine tm) BitVector SIRegisterInfo::getReservedRegs(const MachineFunction MF) const { BitVector Reserved(getNumRegs()); - return Reserved; + Reserved.set(AMDGPU::EXEC); + return Reserved; } Looks like a tab/space indentation change there. With this fixed, this patch is reviewed-by: Vincent Lejeune vljn at ovi.com I'd like somebody else to have a look at the 2 other patches; I'm not familiar enough with the SI ISA for now. - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Vendredi 4 octobre 2013 2h36 Objet : PATCH: R600/SI: Enable the verifier on most lit tests Hi, I would like to start using the machine verifier to help catch compiler bugs. I think it will be especially useful for making sure all our instructions have legal operands. The attached patches fix some simple machine verifier errors and enable it for most lit tests. Unfortunately, we cannot enable the machine verifier on tests that have branches, because the way IF and ELSE instructions are selected leaves us with a copy instruction following the IF and ELSE terminators, which violates one of the verifier rules. -Tom ___ llvm-commits mailing list llvm-comm...@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] radeonsi: Allow Sinking pass to move preloaded const/res/sampl
This fixes a crash in Unigine Heaven 3.0, and probably in some other apps. --- src/gallium/drivers/radeonsi/radeonsi_shader.c | 20 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c index 97ed4e3..89c12c3 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_shader.c +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c @@ -114,8 +114,12 @@ static LLVMValueRef build_indexed_load( { struct lp_build_context * base = si_shader_ctx-radeon_bld.soa.bld_base.base; + LLVMValueRef indices[2] = { + LLVMConstInt(LLVMInt64TypeInContext(base-gallivm-context), 0, false), + offset + }; LLVMValueRef computed_ptr = LLVMBuildGEP( - base-gallivm-builder, base_ptr, offset, 1, ); + base-gallivm-builder, base_ptr, indices, 2, ); LLVMValueRef result = LLVMBuildLoad(base-gallivm-builder, computed_ptr, ); LLVMSetMetadata(result, 1, si_shader_ctx-const_md); @@ -1578,9 +1582,13 @@ static void create_function(struct si_shader_context *si_shader_ctx) v2i32 = LLVMVectorType(i32, 2); v3i32 = LLVMVectorType(i32, 3); - params[SI_PARAM_CONST] = LLVMPointerType(LLVMVectorType(i8, 16), CONST_ADDR_SPACE); - params[SI_PARAM_SAMPLER] = params[SI_PARAM_CONST]; - params[SI_PARAM_RESOURCE] = LLVMPointerType(LLVMVectorType(i8, 32), CONST_ADDR_SPACE); + params[SI_PARAM_CONST] = LLVMPointerType(LLVMArrayType(LLVMVectorType(i8, 16), 64), CONST_ADDR_SPACE); + /* We assume at most 16 textures per program at the moment. 
+* This need probably need to be changed to support bindless textures */ + params[SI_PARAM_SAMPLER] = LLVMPointerType( + LLVMArrayType(LLVMVectorType(i8, 16), 16), CONST_ADDR_SPACE); + params[SI_PARAM_RESOURCE] = LLVMPointerType( + LLVMArrayType(LLVMVectorType(i8, 32), 16), CONST_ADDR_SPACE); switch (si_shader_ctx-type) { case TGSI_PROCESSOR_VERTEX: @@ -1647,6 +1655,10 @@ static void create_function(struct si_shader_context *si_shader_ctx) for (i = 0; i = last_sgpr; ++i) { LLVMValueRef P = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, i); LLVMAddAttribute(P, LLVMInRegAttribute); + /* We tell llvm that array inputs are passed by value to allow Sinking pass +* to move load. Inputs are constant so this is fine. */ + if (i 3) + LLVMAddAttribute(P, LLVMByValAttribute); } #if HAVE_LLVM = 0x0304 -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600/SI: Add SinkingPass before ISel
--- lib/Target/R600/AMDGPUTargetMachine.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index d77cddd..f28f27a 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -120,6 +120,7 @@ bool AMDGPUPassConfig::addPreISel() { const AMDGPUSubtarget ST = TM-getSubtargetAMDGPUSubtarget(); addPass(createFlattenCFGPass()); + addPass(createSinkingPass()); if (ST.getGeneration() AMDGPUSubtarget::NORTHERN_ISLANDS) { addPass(createSITypeRewriter()); addPass(createStructurizeCFGPass()); -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] radeonsi: Allow Sinking pass to move preloaded const/res/sampl
This fixes a crash in Unigine Heaven 3.0, and probably in some other apps. --- src/gallium/drivers/radeonsi/radeonsi_shader.c | 27 ++ src/gallium/drivers/radeonsi/si_state.h| 1 + 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c index 97ed4e3..ab996cc 100644 --- a/src/gallium/drivers/radeonsi/radeonsi_shader.c +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c @@ -114,8 +114,12 @@ static LLVMValueRef build_indexed_load( { struct lp_build_context * base = si_shader_ctx-radeon_bld.soa.bld_base.base; + LLVMValueRef indices[2] = { + LLVMConstInt(LLVMInt64TypeInContext(base-gallivm-context), 0, false), + offset + }; LLVMValueRef computed_ptr = LLVMBuildGEP( - base-gallivm-builder, base_ptr, offset, 1, ); + base-gallivm-builder, base_ptr, indices, 2, ); LLVMValueRef result = LLVMBuildLoad(base-gallivm-builder, computed_ptr, ); LLVMSetMetadata(result, 1, si_shader_ctx-const_md); @@ -1578,9 +1582,14 @@ static void create_function(struct si_shader_context *si_shader_ctx) v2i32 = LLVMVectorType(i32, 2); v3i32 = LLVMVectorType(i32, 3); - params[SI_PARAM_CONST] = LLVMPointerType(LLVMVectorType(i8, 16), CONST_ADDR_SPACE); - params[SI_PARAM_SAMPLER] = params[SI_PARAM_CONST]; - params[SI_PARAM_RESOURCE] = LLVMPointerType(LLVMVectorType(i8, 32), CONST_ADDR_SPACE); + params[SI_PARAM_CONST] = LLVMPointerType( + LLVMArrayType(LLVMVectorType(i8, 16), NUM_CONST_BUFFERS), CONST_ADDR_SPACE); + /* We assume at most 16 textures per program at the moment. 
+* This need probably need to be changed to support bindless textures */ + params[SI_PARAM_SAMPLER] = LLVMPointerType( + LLVMArrayType(LLVMVectorType(i8, 16), NUM_SAMPLER_VIEWS), CONST_ADDR_SPACE); + params[SI_PARAM_RESOURCE] = LLVMPointerType( + LLVMArrayType(LLVMVectorType(i8, 32), NUM_SAMPLER_STATES), CONST_ADDR_SPACE); switch (si_shader_ctx-type) { case TGSI_PROCESSOR_VERTEX: @@ -1647,6 +1656,16 @@ static void create_function(struct si_shader_context *si_shader_ctx) for (i = 0; i = last_sgpr; ++i) { LLVMValueRef P = LLVMGetParam(si_shader_ctx-radeon_bld.main_fn, i); LLVMAddAttribute(P, LLVMInRegAttribute); + /* We tell llvm that array inputs are passed by value to allow Sinking pass +* to move load. Inputs are constant so this is fine. */ + switch (i) { + default: break; + case SI_PARAM_CONST: + case SI_PARAM_SAMPLER: + case SI_PARAM_RESOURCE: + LLVMAddAttribute(P, LLVMByValAttribute); + break; + } } #if HAVE_LLVM = 0x0304 diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 94a1521..6dbf880 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -107,6 +107,7 @@ union si_state { */ #define FMASK_TEX_OFFSET NUM_TEX_UNITS #define NUM_SAMPLER_VIEWS (FMASK_TEX_OFFSET+NUM_TEX_UNITS) +#define NUM_SAMPLER_STATES NUM_TEX_UNITS #define NUM_CONST_BUFFERS 2 -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600: Add a ldptr intrinsic to support MSAA.
--- lib/Target/R600/R600ISelLowering.cpp | 6 +- lib/Target/R600/R600Instructions.td | 4 lib/Target/R600/R600Intrinsics.td| 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 126db73..a6778a4 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -590,7 +590,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const case AMDGPUIntrinsic::R600_txf: case AMDGPUIntrinsic::R600_txq: case AMDGPUIntrinsic::R600_ddx: -case AMDGPUIntrinsic::R600_ddy: { +case AMDGPUIntrinsic::R600_ddy: +case AMDGPUIntrinsic::R600_ldptr: { unsigned TextureOp; switch (IntrinsicID) { case AMDGPUIntrinsic::R600_tex: @@ -623,6 +624,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const case AMDGPUIntrinsic::R600_ddy: TextureOp = 9; break; + case AMDGPUIntrinsic::R600_ldptr: +TextureOp = 10; +break; default: llvm_unreachable(Unknow Texture Operation); } diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 82ecbad..9dc9303 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -881,6 +881,9 @@ def TEX_SAMPLE_C_L : R600_TEX 0x19, TEX_SAMPLE_C_L; def TEX_SAMPLE_LB : R600_TEX 0x12, TEX_SAMPLE_LB; def TEX_SAMPLE_C_LB : R600_TEX 0x1A, TEX_SAMPLE_C_LB; def TEX_LD : R600_TEX 0x03, TEX_LD; +def TEX_LDPTR : R600_TEX 0x03, TEX_LDPTR { + let Inst{6-5} = 1; +} def TEX_GET_TEXTURE_RESINFO : R600_TEX 0x04, TEX_GET_TEXTURE_RESINFO; def TEX_GET_GRADIENTS_H : R600_TEX 0x07, TEX_GET_GRADIENTS_H; def TEX_GET_GRADIENTS_V : R600_TEX 0x08, TEX_GET_GRADIENTS_V; @@ -899,6 +902,7 @@ defm : TexPattern6, TEX_LD, v4i32; defm : TexPattern7, TEX_GET_TEXTURE_RESINFO, v4i32; defm : TexPattern8, TEX_GET_GRADIENTS_H; defm : TexPattern9, TEX_GET_GRADIENTS_V; +defm : TexPattern10, TEX_LDPTR, v4i32; //===--===// // Helper classes for common instructions diff --git 
a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td index 58d86b6..b5cb369 100644 --- a/lib/Target/R600/R600Intrinsics.td +++ b/lib/Target/R600/R600Intrinsics.td @@ -52,6 +52,7 @@ let TargetPrefix = R600, isTarget = 1 in { def int_R600_txb : TextureIntrinsicFloatInput; def int_R600_txbc : TextureIntrinsicFloatInput; def int_R600_txf : TextureIntrinsicInt32Input; + def int_R600_ldptr : TextureIntrinsicInt32Input; def int_R600_txq : TextureIntrinsicInt32Input; def int_R600_ddx : TextureIntrinsicFloatInput; def int_R600_ddy : TextureIntrinsicFloatInput; -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/4] r600g/llvm: fix txq for texture buffer
--- src/gallium/drivers/r600/r600_llvm.c | 7 +-- src/gallium/drivers/r600/r600_shader.c | 1 + src/gallium/drivers/radeon/radeon_llvm.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 03a68e4..54291a1 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -23,6 +23,7 @@ #define CONSTANT_BUFFER_0_ADDR_SPACE 8 #define CONSTANT_BUFFER_1_ADDR_SPACE (CONSTANT_BUFFER_0_ADDR_SPACE + R600_UCP_CONST_BUFFER) #define CONSTANT_TXQ_BUFFER (CONSTANT_BUFFER_0_ADDR_SPACE + R600_TXQ_CONST_BUFFER) +#define LLVM_R600_BUFFER_INFO_CONST_BUFFER (CONSTANT_BUFFER_0_ADDR_SPACE + R600_BUFFER_INFO_CONST_BUFFER) static LLVMValueRef llvm_load_const_buffer( struct lp_build_tgsi_context * bld_base, @@ -410,8 +411,10 @@ static void llvm_emit_tex( if (emit_data-inst-Texture.Texture == TGSI_TEXTURE_BUFFER) { switch (emit_data-inst-Instruction.Opcode) { case TGSI_OPCODE_TXQ: { - LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, 1); - LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, R600_BUFFER_INFO_CONST_BUFFER); + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + ctx-uses_tex_buffers = true; + LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, 0); + LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, LLVM_R600_BUFFER_INFO_CONST_BUFFER); emit_data-output[0] = cvecval; return; } diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index ce15cd7..e8e1333 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -1139,6 +1139,7 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen, radeon_llvm_ctx.alpha_to_one = key.alpha_to_one; mod = r600_tgsi_llvm(radeon_llvm_ctx, tokens); ctx.shader-has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp; + ctx.shader-uses_tex_buffers = 
radeon_llvm_ctx.uses_tex_buffers; if (r600_llvm_compile(mod, rscreen-b.family, ctx.bc, use_kill, dump)) { radeon_llvm_dispose(radeon_llvm_ctx); diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index 14a8c34..345ae70 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -67,6 +67,7 @@ struct radeon_llvm_context { unsigned fs_color_all; unsigned alpha_to_one; unsigned has_txq_cube_array_z_comp; + unsigned uses_tex_buffers; /*=== Front end configuration ===*/ -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/4] r600g/llvm: fix sample cube shadow
--- src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 8ff9abd..ac2e511 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -654,7 +654,8 @@ void radeon_llvm_emit_prepare_cube_coords( opcode == TGSI_OPCODE_TXB2 || opcode == TGSI_OPCODE_TXL2) { coords[3] = coords_arg[4]; - } else if (opcode == TGSI_OPCODE_TXB || + } else if (opcode == TGSI_OPCODE_TEX || + opcode == TGSI_OPCODE_TXB || opcode == TGSI_OPCODE_TXL) { coords[3] = coords_arg[3]; } -- 1.8.3.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] R600: Don't use trans slot for instructions that read LDS source registers
A few comments below, otherwise : reviewed-by: Vincent Lejeunevljn at ovi.com - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org; Tom Stellard thomas.stell...@amd.com Envoyé le : Vendredi 6 septembre 2013 0h23 Objet : [PATCH] R600: Don't use trans slot for instructions that read LDS source registers From: Tom Stellard thomas.stell...@amd.com This fixes some regressions in the piglit local memory store tests introduced by recent commits which made the scheduler aware of the trans slot. It's not possible to test this using lit, because there is no way to determine from the assembly dumps whether or not an instruction is in the trans slot. Even if this were possible, the test would be highly sensitive to changes in the scheduler and might generate confusing false negatives. --- lib/Target/R600/R600InstrInfo.cpp | 17 + lib/Target/R600/R600InstrInfo.h | 1 + lib/Target/R600/R600MachineScheduler.cpp | 5 + lib/Target/R600/R600Packetizer.cpp | 5 + lib/Target/R600/R600RegisterInfo.td | 10 +- 5 files changed, 37 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 0e7cfb4..60a3f7d 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -204,6 +204,23 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { } } +bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { + if (!isALUInstr(MI-getOpcode())) { + return false; + } + for (MachineInstr::const_mop_iterator I = MI-operands_begin(), + E = MI-operands_end(); I != E; ++I) { + if (!I-isReg() || !I-isUse() || + TargetRegisterInfo::isVirtualRegister(I-getReg())) { + continue; + } + if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I-getReg())) { + return true; + } The bracket in this if statements and in the previous one are unneeded. 
+ } + return false; +} + int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { static const unsigned OpTable[] = { AMDGPU::OpName::src0, diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index 24cc43d..0d1ffc8 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -78,6 +78,7 @@ namespace llvm { bool usesTextureCache(const MachineInstr *MI) const; bool mustBeLastInClause(unsigned Opcode) const; + bool readsLDSSrcReg(const MachineInstr *MI) const; /// \returns The operand index for the given source number. Legal values /// for SrcNum are 0, 1, and 2. diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index 0499dd5..f67ba89 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -314,6 +314,11 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { if (regBelongsToClass(DestReg, AMDGPU::R600_Reg128RegClass)) return AluT_XYZW; + // LDS src registers cannot be used in the Trans slot. + if (TII-readsLDSSrcReg(MI)) { + return AluT_XYZW; + } Here too + return AluAny; } diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index 6c70052..ee256d5 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -272,6 +272,11 @@ public: return false; } + // We cannot read LDS source registrs from the Trans slot. 
+ if (isTransSlot TII-readsLDSSrcReg(MI)) { + return false; + } And here too + CurrentPacketMIs.pop_back(); return true; } diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index fa987cf..514427e 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -95,6 +95,12 @@ foreach Index = 448-480 in { // Special Registers +def OQA : R600RegOQA, 219; +def OQB : R600RegOQB, 220; +def OQAP : R600RegOQAP, 221; +def OQBP : R600RegOQAP, 222; +def LDS_DIRECT_A : R600RegLDS_DIRECT_A, 223; +def LDS_DIRECT_B : R600RegLDS_DIRECT_B, 224; def ZERO : R600Reg0.0, 248; def ONE : R600Reg1.0, 249; def NEG_ONE : R600Reg-1.0, 249; @@ -115,7 +121,6 @@ def PRED_SEL_OFF: R600RegPred_sel_off, 0; def PRED_SEL_ZERO : R600RegPred_sel_zero, 2; def PRED_SEL_ONE : R600RegPred_sel_one, 3; def AR_X : R600RegAR.x, 0; -def OQAP : R600RegOQAP, 221; def R600_ArrayBase : RegisterClass AMDGPU, [f32, i32], 32, (add (sequence ArrayBase%u, 448, 480)); @@ -130,6 +135,9 @@ let isAllocatable = 0 in { // XXX: Only use the X channel, until we support wider stack widths def R600_Addr : RegisterClass AMDGPU, [i32], 127, (add (sequence Addr%u_X, 0, 127));
Re: [Mesa-dev] R600 Patches: KCache kernel arguments and 24-bit arithmetic
The whole series is: reviewed-by: Vincent Lejeune vljn at ovi.com In a future patch we might also remove the ISD::BUILD_VECTOR case in the Select() function and use a tablegen pattern; I wrote it because we lowered the r600.load.input intrinsic to a raw register; however, we now lower it to a copy from a register, which should be convertible to a REG_SEQUENCE. Vincent - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Mardi 25 juin 2013 23h37 Objet : [Mesa-dev] R600 Patches: KCache kernel arguments and 24-bit arithmetic Hi, The attached patches clean up kernel argument handling for both R600 and SI and, for R600, make it possible to read arguments through the KCache. There are also patches that add support for the 24-bit arithmetic instructions (MAD_UINT24, MAD_INT24, MUL_UINT24, and MUL_INT24). In order to test these patches you will also need to apply the corresponding Mesa patches, which will be on the mailing list soon. -Tom ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] R600: Bugfixes
Hi, these patches fix 2 bugs in the R600 backend. The first one uses the correct rv710/rv730 encoding for TEX clauses with more than 8 instructions. This bug was spotted here: https://bugs.freedesktop.org/show_bug.cgi?id=64257 The other patch fixes a typo that causes instructions not to use PV/PS registers when R600Packetizer evaluates read port limitations. The typo prevents some bundling opportunities in some (not so frequent) situations. Vincent 0001-R600-Properly-set-COUNT_3-bit-in-TEX-clause-initiati.patch Description: Binary data 0002-R600-PV-stores-Reg-id-not-index.patch Description: Binary data ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600 Patches: Add support for the local address space
Hi, Thanks for your work on this! Patches 2, 4 and 5 have my rb. diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index b9da74c..6de47f7 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -133,6 +133,12 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const { bool R600InstrInfo::isALUInstr(unsigned Opcode) const { unsigned TargetFlags = get(Opcode).TSFlags; + return (TargetFlags R600_InstFlag::ALU_INST); +} + +bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + return ((TargetFlags R600_InstFlag::OP1) | (TargetFlags R600_InstFlag::OP2) | (TargetFlags R600_InstFlag::OP3)); Function prototype is not defined here (it is defined in patch 5). diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index a330d88..acc1b4d 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -269,10 +269,14 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { } // Does the instruction take a whole IG ? +// XXX: Is it possible to add a helper function in R600InstrInfo that can +// be used here and in R600PacketizerList::isSoloInstruction() ? if(TII-isVector(*MI) || TII-isCubeOp(MI-getOpcode()) || -TII-isReductionOp(MI-getOpcode())) +TII-isReductionOp(MI-getOpcode()) || +MI-getOpcode() == AMDGPU::GROUP_BARRIER) { return AluT_XYZW; +} I'm not sure it'll factorize that much code; R600Packetizer is called after cube/reduction ops are lowered by the R600Expand pass, and thus the isVector/ReductionOp check is useless. I may have left some debug code in the isSoloInstruction code though.
- Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Jeudi 13 juin 2013 2h42 Objet : [Mesa-dev] R600 Patches: Add support for the local address space Hi, The attached patches add support for local address space on Evergreen / Northern Islands GPUs. Please Review. -Tom ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] r600g/llvm: fix txq for texture buffer
--- src/gallium/drivers/r600/r600_llvm.c | 7 +-- src/gallium/drivers/r600/r600_shader.c | 1 + src/gallium/drivers/radeon/radeon_llvm.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index c1809b3..77c6abb 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -23,6 +23,7 @@ #define CONSTANT_BUFFER_0_ADDR_SPACE 8 #define CONSTANT_BUFFER_1_ADDR_SPACE (CONSTANT_BUFFER_0_ADDR_SPACE + R600_UCP_CONST_BUFFER) #define CONSTANT_TXQ_BUFFER (CONSTANT_BUFFER_0_ADDR_SPACE + R600_TXQ_CONST_BUFFER) +#define LLVM_R600_BUFFER_INFO_CONST_BUFFER (CONSTANT_BUFFER_0_ADDR_SPACE + R600_BUFFER_INFO_CONST_BUFFER) static LLVMValueRef llvm_load_const_buffer( struct lp_build_tgsi_context * bld_base, @@ -410,8 +411,10 @@ static void llvm_emit_tex( if (emit_data-inst-Texture.Texture == TGSI_TEXTURE_BUFFER) { switch (emit_data-inst-Instruction.Opcode) { case TGSI_OPCODE_TXQ: { - LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, 1); - LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, R600_BUFFER_INFO_CONST_BUFFER); + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + ctx-uses_tex_buffers = true; + LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, 0); + LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, LLVM_R600_BUFFER_INFO_CONST_BUFFER); emit_data-output[0] = cvecval; return; } diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 81ed3ce..2f126c6 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -1170,6 +1170,7 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen, radeon_llvm_ctx.alpha_to_one = key.alpha_to_one; mod = r600_tgsi_llvm(radeon_llvm_ctx, tokens); ctx.shader-has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp; + ctx.shader-uses_tex_buffers = 
radeon_llvm_ctx.uses_tex_buffers; if (r600_llvm_compile(mod, rscreen-family, ctx.bc, use_kill, dump)) { radeon_llvm_dispose(radeon_llvm_ctx); diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index 14a8c34..345ae70 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -67,6 +67,7 @@ struct radeon_llvm_context { unsigned fs_color_all; unsigned alpha_to_one; unsigned has_txq_cube_array_z_comp; + unsigned uses_tex_buffers; /*=== Front end configuration ===*/ -- 1.8.2.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] r600g/llvm: fix sample cube shadow
--- src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 3f7e79f..f49170d 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -654,7 +654,8 @@ void radeon_llvm_emit_prepare_cube_coords( opcode == TGSI_OPCODE_TXB2 || opcode == TGSI_OPCODE_TXL2) { coords[3] = coords_arg[4]; - } else if (opcode == TGSI_OPCODE_TXB || + } else if (opcode == TGSI_OPCODE_TEX || + opcode == TGSI_OPCODE_TXB || opcode == TGSI_OPCODE_TXL) { coords[3] = coords_arg[3]; } -- 1.8.2.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/4] r600g/llvm: Factorize code loading from const buffer.
--- src/gallium/drivers/r600/r600_llvm.c | 51 +--- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index c6c9123..26d40a2 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -23,30 +23,40 @@ #define CONSTANT_BUFFER_0_ADDR_SPACE 8 #define CONSTANT_BUFFER_1_ADDR_SPACE (CONSTANT_BUFFER_0_ADDR_SPACE + R600_UCP_CONST_BUFFER) +static LLVMValueRef llvm_load_const_buffer( + struct lp_build_tgsi_context * bld_base, + LLVMValueRef OffsetValue, + unsigned ConstantAddressSpace) +{ + LLVMValueRef offset[2] = { + LLVMConstInt(LLVMInt64TypeInContext(bld_base-base.gallivm-context), 0, false), + OffsetValue + }; + + LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base-base.elem_type, 4), 1024), + ConstantAddressSpace); + LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base-base.gallivm-builder, lp_build_const_int32(bld_base-base.gallivm, 0), const_ptr_type, ); + LLVMValueRef ptr = LLVMBuildGEP(bld_base-base.gallivm-builder, const_ptr, offset, 2, ); + return LLVMBuildLoad(bld_base-base.gallivm-builder, ptr, ); +} + static LLVMValueRef llvm_fetch_const( struct lp_build_tgsi_context * bld_base, const struct tgsi_full_src_register *reg, enum tgsi_opcode_type type, unsigned swizzle) { - LLVMValueRef offset[2] = { - LLVMConstInt(LLVMInt64TypeInContext(bld_base-base.gallivm-context), 0, false), - lp_build_const_int32(bld_base-base.gallivm, reg-Register.Index) - }; + LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, reg-Register.Index); if (reg-Register.Indirect) { struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); LLVMValueRef index = LLVMBuildLoad(bld_base-base.gallivm-builder, bld-addr[reg-Indirect.Index][reg-Indirect.Swizzle], ); - offset[1] = LLVMBuildAdd(bld_base-base.gallivm-builder, offset[1], index, ); + offset = LLVMBuildAdd(bld_base-base.gallivm-builder, offset, index, ); } unsigned 
ConstantAddressSpace = CONSTANT_BUFFER_0_ADDR_SPACE ; if (reg-Register.Dimension) { ConstantAddressSpace += reg-Dimension.Index; } - LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base-base.elem_type, 4), 1024), - ConstantAddressSpace); - LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base-base.gallivm-builder, lp_build_const_int32(bld_base-base.gallivm, 0), const_ptr_type, ); - LLVMValueRef ptr = LLVMBuildGEP(bld_base-base.gallivm-builder, const_ptr, offset, 2, ); - LLVMValueRef cvecval = LLVMBuildLoad(bld_base-base.gallivm-builder, ptr, ); + LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, ConstantAddressSpace); LLVMValueRef cval = LLVMBuildExtractElement(bld_base-base.gallivm-builder, cvecval, lp_build_const_int32(bld_base-base.gallivm, swizzle), ); return bitcast(bld_base, type, cval); } @@ -250,14 +260,8 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) LLVMValueRef adjusted_elements[4]; for (reg_index = 0; reg_index 2; reg_index ++) { for (chan = 0; chan TGSI_NUM_CHANNELS; chan++) { - LLVMValueRef offset[2] = { - LLVMConstInt(LLVMInt64TypeInContext(bld_base-base.gallivm-context), 0, false), - lp_build_const_int32(bld_base-base.gallivm, reg_index * 4 + chan) - }; - LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base-base.elem_type, 4), 1024), CONSTANT_BUFFER_1_ADDR_SPACE); - LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base-base.gallivm-builder, lp_build_const_int32(bld_base-base.gallivm, 0), const_ptr_type, ); - LLVMValueRef ptr = LLVMBuildGEP(bld_base-base.gallivm-builder, const_ptr, offset, 2, ); - LLVMValueRef base_vector = LLVMBuildLoad(bld_base-base.gallivm-builder, ptr, ); + LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, reg_index * 4 + chan); + LLVMValueRef base_vector = llvm_load_const_buffer(bld_base, offset, CONSTANT_BUFFER_1_ADDR_SPACE);
[Mesa-dev] [PATCH 2/4] r600g/llvm: Fix cubearray textureSize
--- src/gallium/drivers/r600/r600_llvm.c | 15 +++ src/gallium/drivers/r600/r600_shader.c | 1 + src/gallium/drivers/radeon/radeon_llvm.h | 1 + 3 files changed, 17 insertions(+) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 26d40a2..3d2c492 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -22,6 +22,7 @@ #define CONSTANT_BUFFER_0_ADDR_SPACE 8 #define CONSTANT_BUFFER_1_ADDR_SPACE (CONSTANT_BUFFER_0_ADDR_SPACE + R600_UCP_CONST_BUFFER) +#define CONSTANT_TXQ_BUFFER (CONSTANT_BUFFER_0_ADDR_SPACE + R600_TXQ_CONST_BUFFER) static LLVMValueRef llvm_load_const_buffer( struct lp_build_tgsi_context * bld_base, @@ -471,6 +472,20 @@ static void llvm_emit_tex( emit_data-output[0] = build_intrinsic(gallivm-builder, action-intr_name, emit_data-dst_type, args, c, LLVMReadNoneAttribute); + + if (emit_data-inst-Instruction.Opcode == TGSI_OPCODE_TXQ + ((emit_data-inst-Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || + emit_data-inst-Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY))) + if (emit_data-inst-Dst[0].Register.WriteMask 4) { + LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, 0); + LLVMValueRef ZLayer = LLVMBuildExtractElement(gallivm-builder, + llvm_load_const_buffer(bld_base, offset, CONSTANT_TXQ_BUFFER), + lp_build_const_int32(gallivm, 0), ); + + emit_data-output[0] = LLVMBuildInsertElement(gallivm-builder, emit_data-output[0], ZLayer, lp_build_const_int32(gallivm, 2), ); + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + ctx-has_txq_cube_array_z_comp = true; + } } static void emit_cndlt( diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 4d74db0..81ed3ce 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -1169,6 +1169,7 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen, radeon_llvm_ctx.clip_vertex = ctx.cv_output; 
radeon_llvm_ctx.alpha_to_one = key.alpha_to_one; mod = r600_tgsi_llvm(radeon_llvm_ctx, tokens); + ctx.shader-has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp; if (r600_llvm_compile(mod, rscreen-family, ctx.bc, use_kill, dump)) { radeon_llvm_dispose(radeon_llvm_ctx); diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index 1d4bd45..14a8c34 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -66,6 +66,7 @@ struct radeon_llvm_context { unsigned color_buffer_count; unsigned fs_color_all; unsigned alpha_to_one; + unsigned has_txq_cube_array_z_comp; /*=== Front end configuration ===*/ -- 1.8.2.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/4] r600g/llvm: Factorize code loading from const buffer.
--- src/gallium/drivers/r600/r600_llvm.c | 51 +--- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index c6c9123..26d40a2 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -23,30 +23,40 @@ #define CONSTANT_BUFFER_0_ADDR_SPACE 8 #define CONSTANT_BUFFER_1_ADDR_SPACE (CONSTANT_BUFFER_0_ADDR_SPACE + R600_UCP_CONST_BUFFER) +static LLVMValueRef llvm_load_const_buffer( + struct lp_build_tgsi_context * bld_base, + LLVMValueRef OffsetValue, + unsigned ConstantAddressSpace) +{ + LLVMValueRef offset[2] = { + LLVMConstInt(LLVMInt64TypeInContext(bld_base-base.gallivm-context), 0, false), + OffsetValue + }; + + LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base-base.elem_type, 4), 1024), + ConstantAddressSpace); + LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base-base.gallivm-builder, lp_build_const_int32(bld_base-base.gallivm, 0), const_ptr_type, ); + LLVMValueRef ptr = LLVMBuildGEP(bld_base-base.gallivm-builder, const_ptr, offset, 2, ); + return LLVMBuildLoad(bld_base-base.gallivm-builder, ptr, ); +} + static LLVMValueRef llvm_fetch_const( struct lp_build_tgsi_context * bld_base, const struct tgsi_full_src_register *reg, enum tgsi_opcode_type type, unsigned swizzle) { - LLVMValueRef offset[2] = { - LLVMConstInt(LLVMInt64TypeInContext(bld_base-base.gallivm-context), 0, false), - lp_build_const_int32(bld_base-base.gallivm, reg-Register.Index) - }; + LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, reg-Register.Index); if (reg-Register.Indirect) { struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); LLVMValueRef index = LLVMBuildLoad(bld_base-base.gallivm-builder, bld-addr[reg-Indirect.Index][reg-Indirect.Swizzle], ); - offset[1] = LLVMBuildAdd(bld_base-base.gallivm-builder, offset[1], index, ); + offset = LLVMBuildAdd(bld_base-base.gallivm-builder, offset, index, ); } unsigned 
ConstantAddressSpace = CONSTANT_BUFFER_0_ADDR_SPACE ; if (reg-Register.Dimension) { ConstantAddressSpace += reg-Dimension.Index; } - LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base-base.elem_type, 4), 1024), - ConstantAddressSpace); - LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base-base.gallivm-builder, lp_build_const_int32(bld_base-base.gallivm, 0), const_ptr_type, ); - LLVMValueRef ptr = LLVMBuildGEP(bld_base-base.gallivm-builder, const_ptr, offset, 2, ); - LLVMValueRef cvecval = LLVMBuildLoad(bld_base-base.gallivm-builder, ptr, ); + LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, ConstantAddressSpace); LLVMValueRef cval = LLVMBuildExtractElement(bld_base-base.gallivm-builder, cvecval, lp_build_const_int32(bld_base-base.gallivm, swizzle), ); return bitcast(bld_base, type, cval); } @@ -250,14 +260,8 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) LLVMValueRef adjusted_elements[4]; for (reg_index = 0; reg_index 2; reg_index ++) { for (chan = 0; chan TGSI_NUM_CHANNELS; chan++) { - LLVMValueRef offset[2] = { - LLVMConstInt(LLVMInt64TypeInContext(bld_base-base.gallivm-context), 0, false), - lp_build_const_int32(bld_base-base.gallivm, reg_index * 4 + chan) - }; - LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base-base.elem_type, 4), 1024), CONSTANT_BUFFER_1_ADDR_SPACE); - LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base-base.gallivm-builder, lp_build_const_int32(bld_base-base.gallivm, 0), const_ptr_type, ); - LLVMValueRef ptr = LLVMBuildGEP(bld_base-base.gallivm-builder, const_ptr, offset, 2, ); - LLVMValueRef base_vector = LLVMBuildLoad(bld_base-base.gallivm-builder, ptr, ); + LLVMValueRef offset = lp_build_const_int32(bld_base-base.gallivm, reg_index * 4 + chan); + LLVMValueRef base_vector = llvm_load_const_buffer(bld_base, offset, CONSTANT_BUFFER_1_ADDR_SPACE);
[Mesa-dev] [PATCH 4/4] r600g/llvm: fix cubemap lod/bias
--- src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 0629b89..3f7e79f 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -654,6 +654,9 @@ void radeon_llvm_emit_prepare_cube_coords( opcode == TGSI_OPCODE_TXB2 || opcode == TGSI_OPCODE_TXL2) { coords[3] = coords_arg[4]; + } else if (opcode == TGSI_OPCODE_TXB || + opcode == TGSI_OPCODE_TXL) { + coords[3] = coords_arg[3]; } } -- 1.8.2.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/4] r600g/llvm: Fix texelFetchOffset-2D
--- src/gallium/drivers/r600/r600_llvm.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 3d2c492..c1809b3 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -460,6 +460,12 @@ static void llvm_emit_tex( for (c = 1; c emit_data-arg_count; ++c) args[c] = emit_data-args[c]; + if (emit_data-inst-Instruction.Opcode == TGSI_OPCODE_TXF) { + args[1] = LLVMBuildShl(gallivm-builder, args[1], lp_build_const_int32(gallivm, 1), ); + args[2] = LLVMBuildShl(gallivm-builder, args[2], lp_build_const_int32(gallivm, 1), ); + args[3] = LLVMBuildShl(gallivm-builder, args[3], lp_build_const_int32(gallivm, 1), ); + } + sampler_src = emit_data-inst-Instruction.NumSrcRegs-1; args[c++] = lp_build_const_int32(gallivm, -- 1.8.2.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600/SI Patches: A few cleanups for compute
Hi, -- next part -- From dc547a89dac5039ce521f3c27fb23346251d488d Mon Sep 17 00:00:00 2001 From: Tom Stellard thomas.stellard at amd.com Date: Tue, 7 May 2013 16:26:26 -0400 Subject: [PATCH 4/7] R600: Swap the legality of rotl and rotr The hardware supports rotr and not rotl. --- lib/Target/R600/AMDGPUISelLowering.cpp | 3 +++ lib/Target/R600/AMDGPUISelLowering.h | 1 - lib/Target/R600/AMDGPUInstrInfo.td | 6 -- lib/Target/R600/AMDGPUInstructions.td | 6 ++ lib/Target/R600/AMDILISelLowering.cpp | 2 -- lib/Target/R600/R600ISelLowering.cpp | 15 --- lib/Target/R600/R600Instructions.td| 6 ++ test/CodeGen/R600/rotr.ll | 29 + 8 files changed, 40 insertions(+), 28 deletions(-) create mode 100644 test/CodeGen/R600/rotr.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index a266df5..b3c51e3 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -46,6 +46,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) : setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); + // The hardware supports ROTR, but not ROTL + setOperationAction(ISD::ROTL, MVT::i32, Expand); + // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. 
setOperationAction(ISD::STORE, MVT::f32, Promote); diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index c2a79ea..6f8ab8b 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -115,7 +115,6 @@ enum { RET_FLAG, BRANCH_COND, // End AMDIL ISD Opcodes - BITALIGN, BUFFER_STORE, DWORDADDR, FRACT, diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index b66ae87..a0a3410 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -23,12 +23,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile1, 3, [ // AMDGPU DAG Nodes // -// out = ((a 32) | b) c) -// -// Can be used to optimize rtol: -// rotl(a, b) = bitalign(a, a, 32 - b) -def AMDGPUbitalign : SDNodeAMDGPUISD::BITALIGN, AMDGPUDTIntTernaryOp; - // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNodeAMDGPUISD::DWORDADDR, SDTIntUnaryOp; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index d2620b2..54df7d0 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -295,6 +295,12 @@ class BFEPattern Instruction BFE : Pat (BFE $x, $y, $z) ; +// rotr pattern +class ROTRPattern Instruction BIT_ALIGN : Pat + (rotr i32:$src0, i32:$src1), + (BIT_ALIGN $src0, $src0, $src1) +; + include R600Instructions.td include SIInstrInfo.td diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 922cac1..e20dbe0 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -138,8 +138,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); -// GPU doesn't have a rotl, rotr, or byteswap instruction -setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); // GPU doesn't have any counting operators diff 
--git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 7252235..e58a8dd 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -72,8 +72,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine TM) : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); - setOperationAction(ISD::ROTL, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -327,7 +325,6 @@ using namespace llvm::AMDGPUIntrinsic; SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::ROTL: return LowerROTL(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); @@ -518,18 +515,6 @@ SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG DAG) const return DAG.getConstant(Offset * 4 * TFL-getStackWidth(MF), MVT::i32); } -SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG DAG) const { - DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - - return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT, - Op.getOperand(0), -
Re: [Mesa-dev] [PATCH 2/2] R600: Fix encoding for R600 family GPUs
Thanks for fixing this! Both patches are reviewed-by: vljn at ovi.com - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org; Tom Stellard thomas.stell...@amd.com Envoyé le : Mercredi 15 mai 2013 1h03 Objet : [Mesa-dev] [PATCH 2/2] R600: Fix encoding for R600 family GPUs From: Tom Stellard thomas.stell...@amd.com https://bugs.freedesktop.org/show_bug.cgi?id=64193 https://bugs.freedesktop.org/show_bug.cgi?id=64257 https://bugs.freedesktop.org/show_bug.cgi?id=64320 NOTE: This is a candidate for the 3.3 branch. --- lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 7 +++ test/CodeGen/R600/r600-encoding.ll | 24 ++ 2 files changed, 31 insertions(+) create mode 100644 test/CodeGen/R600/r600-encoding.ll diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index c5bd01a..cb4cf0c 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -179,6 +179,13 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit((u_int32_t) 0, OS); } else { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + if ((STI.getFeatureBits() AMDGPU::FeatureR600ALUInst) + ((Desc.TSFlags R600_InstFlag::OP1) || + Desc.TSFlags R600_InstFlag::OP2)) { + uint64_t ISAOpCode = Inst (0x3FFULL 39); + Inst = ~(0x3FFULL 39); + Inst |= ISAOpCode 1; + } Emit(Inst, OS); } } diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll new file mode 100644 index 000..c8040a1 --- /dev/null +++ b/test/CodeGen/R600/r600-encoding.ll @@ -0,0 +1,24 @@ +; RUN: llc %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s +; RUN: llc %s -march=r600 -show-mc-encoding -mcpu=rs880 | FileCheck --check-prefix=R600-CHECK %s + +; The earliest R600 GPUs have a slightly different encoding than the rest of +; the VLIW4/5 GPUs.
+ +; EG-CHECK: @test +; EG-CHECK: MUL_IEEE {{[ *TXYZW.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}] + +; R600-CHECK: @test +; R600-CHECK: MUL_IEEE {{[ *TXYZW.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}] + +define void @test() { +entry: + %0 = call float @llvm.R600.load.input(i32 0) + %1 = call float @llvm.R600.load.input(i32 1) + %2 = fmul float %0, %1 + call void @llvm.AMDGPU.store.output(float %2, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) -- 1.7.11.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600 Patchset: Emit true ISA
Reviewed-by:Vincent Lejeunevljn at ovi.com - Mail original - De : Tom Stellard t...@stellard.net À : Vincent Lejeune v...@ovi.com Cc : llvm-comm...@cs.uiuc.edu llvm-comm...@cs.uiuc.edu; mesa-dev@lists.freedesktop.org mesa-dev@lists.freedesktop.org Envoyé le : Lundi 6 mai 2013 17h02 Objet : Re: R600 Patchset: Emit true ISA On Sat, May 04, 2013 at 09:09:25AM -0700, Vincent Lejeune wrote: Hi, Thank for doing this. Patches 1 2 and 3 have my rb. For patch 4: Hi Vincent, Attached is an updated version of patch 4. -Tom @@ -125,9 +106,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo MCII, void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, SmallVectorImplMCFixup Fixups) const { - if (isFCOp(MI.getOpcode())){ - EmitFCInstr(MI, OS); - } else if (MI.getOpcode() == AMDGPU::RETURN || + if (MI.getOpcode() == AMDGPU::RETURN || MI.getOpcode() == AMDGPU::FETCH_CLAUSE || MI.getOpcode() == AMDGPU::ALU_CLAUSE || MI.getOpcode() == AMDGPU::BUNDLE || @@ -135,12 +114,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, return; } else { switch(MI.getOpcode()) { - case AMDGPU::RAT_WRITE_CACHELESS_32_eg: - case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - uint64_t inst = getBinaryCodeForInstr(MI, Fixups); - Emit(inst, OS); - break; - } case AMDGPU::CONSTANT_LOAD_eg: case AMDGPU::VTX_READ_PARAM_8_eg: case AMDGPU::VTX_READ_PARAM_16_eg: Is it possible to use R600_InstFlag::VTX_INST and R600_InstFlag::TEX_INST instead and to remove the switch() statement ? @@ -234,44 +207,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit((u_int32_t) 0, OS); break; } - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Samedi 4 mai 2013 0h53 Objet : R600 Patchset: Emit true ISA Hi, The attached patches modify the CodeEmitter to emit true ISA. Previously, we were prefixing all instructions with an instruction type byte. 
Vincent did most of the work to convert the CodeEmitter to true ISA, these patches are just the last few cleanups that are needed to finish the project. Please test/review. Thanks, Tom ___ llvm-commits mailing list llvm-comm...@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600 Patchset: Emit true ISA
Hi, Thank for doing this. Patches 1 2 and 3 have my rb. For patch 4: @@ -125,9 +106,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo MCII, void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, SmallVectorImplMCFixup Fixups) const { - if (isFCOp(MI.getOpcode())){ -EmitFCInstr(MI, OS); - } else if (MI.getOpcode() == AMDGPU::RETURN || + if (MI.getOpcode() == AMDGPU::RETURN || MI.getOpcode() == AMDGPU::FETCH_CLAUSE || MI.getOpcode() == AMDGPU::ALU_CLAUSE || MI.getOpcode() == AMDGPU::BUNDLE || @@ -135,12 +114,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, return; } else { switch(MI.getOpcode()) { -case AMDGPU::RAT_WRITE_CACHELESS_32_eg: -case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - uint64_t inst = getBinaryCodeForInstr(MI, Fixups); - Emit(inst, OS); - break; -} case AMDGPU::CONSTANT_LOAD_eg: case AMDGPU::VTX_READ_PARAM_8_eg: case AMDGPU::VTX_READ_PARAM_16_eg: Is it possible to use R600_InstFlag::VTX_INST and R600_InstFlag::TEX_INST instead and to remove the switch() statement ? @@ -234,44 +207,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit((u_int32_t) 0, OS); break; } - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Samedi 4 mai 2013 0h53 Objet : R600 Patchset: Emit true ISA Hi, The attached patches modify the CodeEmitter to emit true ISA. Previously, we were prefixing all instructions with an instruction type byte. Vincent did most of the work to convert the CodeEmitter to true ISA, these patches are just the last few cleanups that are needed to finish the project. Please test/review. Thanks, Tom ___ llvm-commits mailing list llvm-comm...@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600g/llvm: Undefines unrequired texture coord values
This is a port of r600g:mask unused source components for SAMPLE patch from Vadim Girlin. --- src/gallium/drivers/r600/r600_llvm.c | 25 - 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 83d7340..a94faf2 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -429,9 +429,32 @@ static void llvm_emit_tex( } } + if (emit_data-inst-Instruction.Opcode == TGSI_OPCODE_TEX) { + LLVMValueRef Vector[4] = { + LLVMBuildExtractElement(gallivm-builder, emit_data-args[0], lp_build_const_int32(gallivm, 0), ), + LLVMBuildExtractElement(gallivm-builder, emit_data-args[0], lp_build_const_int32(gallivm, 1), ), + LLVMBuildExtractElement(gallivm-builder, emit_data-args[0], lp_build_const_int32(gallivm, 2), ), + LLVMBuildExtractElement(gallivm-builder, emit_data-args[0], lp_build_const_int32(gallivm, 3), ), + }; + switch (emit_data-inst-Texture.Texture) { + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + Vector[2] = Vector[3] = LLVMGetUndef(bld_base-base.elem_type); + break; + case TGSI_TEXTURE_1D: + Vector[1] = Vector[2] = Vector[3] = LLVMGetUndef(bld_base-base.elem_type); + break; + default: + break; + } + args[0] = lp_build_gather_values(gallivm, Vector, 4); + } else { + args[0] = emit_data-args[0]; + } + assert(emit_data-arg_count + 2 = Elements(args)); - for (c = 0; c emit_data-arg_count; ++c) + for (c = 1; c emit_data-arg_count; ++c) args[c] = emit_data-args[c]; sampler_src = emit_data-inst-Instruction.NumSrcRegs-1; -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600/llvm: use killgt info from llvm
--- src/gallium/drivers/r600/r600_llvm.c | 2 ++ src/gallium/drivers/r600/r600_llvm.h | 1 + src/gallium/drivers/r600/r600_shader.c | 8 ++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 2050be2..83d7340 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -556,6 +556,7 @@ unsigned r600_llvm_compile( unsigned * inst_byte_count, enum radeon_family family, struct r600_bytecode *bc, + boolean *use_kill, unsigned dump) { unsigned r; @@ -566,6 +567,7 @@ unsigned r600_llvm_compile( *inst_byte_count = binary.code_size; bc-ngpr = util_le32_to_cpu(*(uint32_t*)binary.config); bc-nstack = util_le32_to_cpu(*(uint32_t*)(binary.config + 4)); + *use_kill = util_le32_to_cpu(*(uint32_t*)(binary.config + 8)); return r; } diff --git a/src/gallium/drivers/r600/r600_llvm.h b/src/gallium/drivers/r600/r600_llvm.h index 919dd24..50bbca6 100644 --- a/src/gallium/drivers/r600/r600_llvm.h +++ b/src/gallium/drivers/r600/r600_llvm.h @@ -22,6 +22,7 @@ unsigned r600_llvm_compile( unsigned * inst_byte_count, enum radeon_family family, struct r600_bytecode *bc, + boolean *use_kill, unsigned dump); #endif /* defined R600_USE_LLVM || defined HAVE_OPENCL */ diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 0204f80..25f900f 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -272,6 +272,7 @@ int r600_compute_shader_create(struct pipe_context * ctx, unsigned byte_count; struct r600_shader_ctx shader_ctx; bool dump = (r600_ctx-screen-debug_flags DBG_CS) != 0; + boolean use_kill; shader_ctx.bc = bytecode; r600_bytecode_init(shader_ctx.bc, r600_ctx-chip_class, r600_ctx-family, @@ -279,7 +280,7 @@ int r600_compute_shader_create(struct pipe_context * ctx, shader_ctx.bc-type = TGSI_PROCESSOR_COMPUTE; shader_ctx.bc-isa = r600_ctx-isa; r600_llvm_compile(mod, bytes, byte_count, 
r600_ctx-family, - shader_ctx.bc, dump); + shader_ctx.bc, use_kill, dump); r600_bytecode_from_byte_stream(shader_ctx, bytes, byte_count); if (shader_ctx.bc-chip_class == CAYMAN) { cm_bytecode_add_cf_end(shader_ctx.bc); @@ -1444,6 +1445,7 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen, if (use_llvm) { struct radeon_llvm_context radeon_llvm_ctx; LLVMModuleRef mod; + boolean use_kill; bool dump = r600_can_dump_shader(rscreen, ctx.type); memset(radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx)); @@ -1461,13 +1463,15 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen, mod = r600_tgsi_llvm(radeon_llvm_ctx, tokens); if (r600_llvm_compile(mod, inst_bytes, inst_byte_count, - rscreen-family, ctx.bc, dump)) { + rscreen-family, ctx.bc, use_kill, dump)) { FREE(inst_bytes); radeon_llvm_dispose(radeon_llvm_ctx); use_llvm = 0; fprintf(stderr, R600 LLVM backend failed to compile shader. Falling back to TGSI\n); } else { + if (use_kill) + ctx.shader-uses_kill = use_kill; ctx.file_offset[TGSI_FILE_OUTPUT] = ctx.file_offset[TGSI_FILE_INPUT]; } -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/3] R600: Emit used GPRs count
--- lib/Target/R600/AMDGPUAsmPrinter.cpp | 35 +-- lib/Target/R600/AMDGPUAsmPrinter.h | 3 ++- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index dacb033..580cfb4 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -21,6 +21,7 @@ #include AMDGPU.h #include SIMachineFunctionInfo.h #include SIRegisterInfo.h +#include R600RegisterInfo.h #include llvm/MC/MCStreamer.h #include llvm/Support/TargetRegistry.h #include llvm/Target/TargetLoweringObjectFile.h @@ -52,13 +53,43 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction MF) { } OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); if (STM.device()-getGeneration() AMDGPUDeviceInfo::HD6XXX) { -EmitProgramInfo(MF); +EmitProgramInfoSI(MF); + } else { +EmitProgramInfoR600(MF); } EmitFunctionBody(); return false; } -void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction MF) { +void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction MF) { + unsigned MaxGPR = 0; + const R600RegisterInfo * RI = +static_castconst R600RegisterInfo*(TM.getRegisterInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { +MachineBasicBlock MBB = *BB; +for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); +I != E; ++I) { + MachineInstr MI = *I; + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx numOperands; op_idx++) { +MachineOperand MO = MI.getOperand(op_idx); +if (!MO.isReg()) + continue; +unsigned HWReg = RI-getEncodingValue(MO.getReg()) 0xff; + +// Register with value 127 aren't GPR +if (HWReg 127) + continue; +MaxGPR = std::max(MaxGPR, HWReg); + } +} + } + OutStreamer.EmitIntValue(MaxGPR + 1, 4); +} + +void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction MF) { unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h 
b/lib/Target/R600/AMDGPUAsmPrinter.h index 3812282..f425ef4 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -33,7 +33,8 @@ public: /// \brief Emit register usage information so that the GPU driver /// can correctly setup the GPU state. - void EmitProgramInfo(MachineFunction MF); + void EmitProgramInfoR600(MachineFunction MF); + void EmitProgramInfoSI(MachineFunction MF); /// Implemented in AMDGPUMCInstLower.cpp virtual void EmitInstruction(const MachineInstr *MI); -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/3] R600: Export is emitted as a CF_NATIVE inst
--- lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 15 +-- lib/Target/R600/R600Instructions.td| 8 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 469a8ad..416d710 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -250,15 +250,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit(Word2, OS); break; } -case AMDGPU::EG_ExportSwz: -case AMDGPU::R600_ExportSwz: -case AMDGPU::EG_ExportBuf: -case AMDGPU::R600_ExportBuf: { - uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); - EmitByte(INSTR_EXPORT, OS); - Emit(Inst, OS); - break; -} case AMDGPU::CF_ALU: case AMDGPU::CF_ALU_PUSH_BEFORE: { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); @@ -286,7 +277,11 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, case AMDGPU::CF_CONTINUE_R600: case AMDGPU::CF_JUMP_R600: case AMDGPU::CF_ELSE_R600: -case AMDGPU::POP_R600: { +case AMDGPU::POP_R600: +case AMDGPU::EG_ExportSwz: +case AMDGPU::R600_ExportSwz: +case AMDGPU::EG_ExportBuf: +case AMDGPU::R600_ExportBuf: { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); EmitByte(INSTR_NATIVE, OS); Emit(Inst, OS); diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index b4c45e1..2e9a8a3 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1423,7 +1423,7 @@ let Predicates = [isR600] in { (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src)); def R600_ExportSwz : ExportSwzInst { -let Word1{20-17} = 1; // BURST_COUNT +let Word1{20-17} = 0; // BURST_COUNT let Word1{21} = eop; let Word1{22} = 1; // VALID_PIXEL_MODE let Word1{30-23} = inst; @@ -1432,7 +1432,7 @@ let Predicates = [isR600] in { defm : ExportPatternR600_ExportSwz, 39; def R600_ExportBuf : ExportBufInst { -let Word1{20-17} = 1; // 
BURST_COUNT +let Word1{20-17} = 0; // BURST_COUNT let Word1{21} = eop; let Word1{22} = 1; // VALID_PIXEL_MODE let Word1{30-23} = inst; @@ -1622,7 +1622,7 @@ let hasSideEffects = 1 in { (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0)); def EG_ExportSwz : ExportSwzInst { -let Word1{19-16} = 1; // BURST_COUNT +let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 1; // VALID_PIXEL_MODE let Word1{21} = eop; let Word1{29-22} = inst; @@ -1632,7 +1632,7 @@ let hasSideEffects = 1 in { defm : ExportPatternEG_ExportSwz, 83; def EG_ExportBuf : ExportBufInst { -let Word1{19-16} = 1; // BURST_COUNT +let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 1; // VALID_PIXEL_MODE let Word1{21} = eop; let Word1{29-22} = inst; -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/3] R600: Make Export Instruction not duplicable
--- lib/Target/R600/R600Instructions.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 2e9a8a3..1c292bb 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -738,7 +738,7 @@ multiclass SteamOutputExportPatternInstruction ExportInst, 4095, imm:$mask, buf3inst, 0); } -let usesCustomInserter = 1 in { +let usesCustomInserter = 1, isNotDuplicable = 1 in { class ExportSwzInst : InstR600ISA( outs), -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600: Add VTX_READ_* and RAT_WRITE_CACHELESS_* when computing cf addr
--- lib/Target/R600/R600ControlFlowFinalizer.cpp | 11 ++- test/CodeGen/R600/loop-adress.ll | 44 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/R600/loop-adress.ll diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index cfaa36e..2350130 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -67,6 +67,13 @@ private: case AMDGPU::TEX_SAMPLE_C_G: case AMDGPU::TXD: case AMDGPU::TXD_SHADOW: +case AMDGPU::VTX_READ_GLOBAL_8_eg: +case AMDGPU::VTX_READ_GLOBAL_32_eg: +case AMDGPU::VTX_READ_GLOBAL_128_eg: +case AMDGPU::VTX_READ_PARAM_8_eg: +case AMDGPU::VTX_READ_PARAM_16_eg: +case AMDGPU::VTX_READ_PARAM_32_eg: +case AMDGPU::VTX_READ_PARAM_128_eg: return true; default: return false; @@ -207,6 +214,8 @@ public: case AMDGPU::EG_ExportSwz: case AMDGPU::R600_ExportBuf: case AMDGPU::R600_ExportSwz: +case AMDGPU::RAT_WRITE_CACHELESS_32_eg: +case AMDGPU::RAT_WRITE_CACHELESS_128_eg: DEBUG(dbgs() CfCount :; MI-dump();); CfCount++; break; @@ -215,7 +224,7 @@ public: MaxStack = std::max(MaxStack, CurrentStack); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_WHILE_LOOP)) - .addImm(2); + .addImm(1); std::pairunsigned, std::setMachineInstr * Pair(CfCount, std::setMachineInstr *()); Pair.second.insert(MIb); diff --git a/test/CodeGen/R600/loop-adress.ll b/test/CodeGen/R600/loop-adress.ll new file mode 100644 index 000..dc9295e --- /dev/null +++ b/test/CodeGen/R600/loop-adress.ll @@ -0,0 +1,44 @@ +;RUN: llc %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: TEX +;CHECK: ALU_PUSH +;CHECK: JUMP @4 +;CHECK: ELSE @16 +;CHECK: TEX +;CHECK: LOOP_START_DX10 @15 +;CHECK: LOOP_BREAK @14 +;CHECK: POP @16 + +target datalayout = 
e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64 +target triple = r600-- + +define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { +entry: + %cmp5 = icmp sgt i32 %iterations, 0 + br i1 %cmp5, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] + %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %i.07 = add nsw i32 %i.07.in, -1 + %arrayidx = getelementptr inbounds i32 addrspace(1)* %out, i32 %ai.06 + store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4, !tbaa !4 + %add = add nsw i32 %ai.06, 1 + %exitcond = icmp eq i32 %add, %iterations + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind fp-contract-model=standard relocation-model=pic ssp-buffers-size=8 } + +!opencl.kernels = !{!0, !1, !2, !3} + +!0 = metadata !{void (i32 addrspace(1)*, i32)* @loop_ge} +!1 = metadata !{null} +!2 = metadata !{null} +!3 = metadata !{null} +!4 = metadata !{metadata !int, metadata !5} +!5 = metadata !{metadata !omnipotent char, metadata !6} +!6 = metadata !{metadata !Simple C/C++ TBAA} -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600: Control Flow support for pre EG gen
--- lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 30 ++-- lib/Target/R600/R600ControlFlowFinalizer.cpp | 84 +++-- lib/Target/R600/R600Instructions.td| 198 +++-- 3 files changed, 240 insertions(+), 72 deletions(-) diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 927bcbd..469a8ad 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -266,17 +266,27 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit(Inst, OS); break; } -case AMDGPU::CF_TC: -case AMDGPU::CF_VC: -case AMDGPU::CF_CALL_FS: +case AMDGPU::CF_TC_EG: +case AMDGPU::CF_VC_EG: +case AMDGPU::CF_CALL_FS_EG: +case AMDGPU::CF_TC_R600: +case AMDGPU::CF_VC_R600: +case AMDGPU::CF_CALL_FS_R600: return; -case AMDGPU::WHILE_LOOP: -case AMDGPU::END_LOOP: -case AMDGPU::LOOP_BREAK: -case AMDGPU::CF_CONTINUE: -case AMDGPU::CF_JUMP: -case AMDGPU::CF_ELSE: -case AMDGPU::POP: { +case AMDGPU::WHILE_LOOP_EG: +case AMDGPU::END_LOOP_EG: +case AMDGPU::LOOP_BREAK_EG: +case AMDGPU::CF_CONTINUE_EG: +case AMDGPU::CF_JUMP_EG: +case AMDGPU::CF_ELSE_EG: +case AMDGPU::POP_EG: +case AMDGPU::WHILE_LOOP_R600: +case AMDGPU::END_LOOP_R600: +case AMDGPU::LOOP_BREAK_R600: +case AMDGPU::CF_CONTINUE_R600: +case AMDGPU::CF_JUMP_R600: +case AMDGPU::CF_ELSE_R600: +case AMDGPU::POP_R600: { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); EmitByte(INSTR_NATIVE, OS); Emit(Inst, OS); diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index 3a6c7ea..cfaa36e 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -30,9 +30,22 @@ namespace llvm { class R600ControlFlowFinalizer : public MachineFunctionPass { private: + enum ControlFlowInstruction { +CF_TC, +CF_CALL_FS, +CF_WHILE_LOOP, +CF_END_LOOP, +CF_LOOP_BREAK, +CF_LOOP_CONTINUE, +CF_JUMP, +CF_ELSE, +CF_POP + }; + 
static char ID; const R600InstrInfo *TII; unsigned MaxFetchInst; + const AMDGPUSubtarget ST; bool isFetch(const MachineInstr *MI) const { switch (MI-getOpcode()) { @@ -70,6 +83,52 @@ private: } } + const MCInstrDesc getHWInstrDesc(ControlFlowInstruction CFI) const { +if (ST.device()-getGeneration() = AMDGPUDeviceInfo::HD4XXX) { + switch (CFI) { + case CF_TC: +return TII-get(AMDGPU::CF_TC_R600); + case CF_CALL_FS: +return TII-get(AMDGPU::CF_CALL_FS_R600); + case CF_WHILE_LOOP: +return TII-get(AMDGPU::WHILE_LOOP_R600); + case CF_END_LOOP: +return TII-get(AMDGPU::END_LOOP_R600); + case CF_LOOP_BREAK: +return TII-get(AMDGPU::LOOP_BREAK_R600); + case CF_LOOP_CONTINUE: +return TII-get(AMDGPU::CF_CONTINUE_R600); + case CF_JUMP: +return TII-get(AMDGPU::CF_JUMP_R600); + case CF_ELSE: +return TII-get(AMDGPU::CF_ELSE_R600); + case CF_POP: +return TII-get(AMDGPU::POP_R600); + } +} else { + switch (CFI) { + case CF_TC: +return TII-get(AMDGPU::CF_TC_EG); + case CF_CALL_FS: +return TII-get(AMDGPU::CF_CALL_FS_EG); + case CF_WHILE_LOOP: +return TII-get(AMDGPU::WHILE_LOOP_EG); + case CF_END_LOOP: +return TII-get(AMDGPU::END_LOOP_EG); + case CF_LOOP_BREAK: +return TII-get(AMDGPU::LOOP_BREAK_EG); + case CF_LOOP_CONTINUE: +return TII-get(AMDGPU::CF_CONTINUE_EG); + case CF_JUMP: +return TII-get(AMDGPU::CF_JUMP_EG); + case CF_ELSE: +return TII-get(AMDGPU::CF_ELSE_EG); + case CF_POP: +return TII-get(AMDGPU::POP_EG); + } +} + } + MachineBasicBlock::iterator MakeFetchClause(MachineBasicBlock MBB, MachineBasicBlock::iterator I, unsigned CfAddress) const { @@ -85,7 +144,7 @@ private: break; } BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), -TII-get(AMDGPU::CF_TC)) +getHWInstrDesc(CF_TC)) .addImm(CfAddress) // ADDR .addImm(AluInstCount); // COUNT return I; @@ -104,7 +163,8 @@ private: public: R600ControlFlowFinalizer(TargetMachine tm) : MachineFunctionPass(ID), -TII (static_castconst R600InstrInfo *(tm.getInstrInfo())) { +TII (static_castconst R600InstrInfo *(tm.getInstrInfo())), 
+ST(tm.getSubtargetAMDGPUSubtarget()) { const AMDGPUSubtarget ST = tm.getSubtargetAMDGPUSubtarget(); if (ST.device()-getGeneration() = AMDGPUDeviceInfo::HD4XXX) MaxFetchInst = 8; @@ -124,7 +184,7 @@ public: R600MachineFunctionInfo *MFI = MF.getInfoR600MachineFunctionInfo(); if (MFI-ShaderType == 1) { BuildMI(MBB,
[Mesa-dev] [PATCH] r600g/llvm: Add support for native isa for pre EG
This fixes bug 62756 : https://bugs.freedesktop.org/show_bug.cgi?id=62756#c12 (Requires corresponding llvm commit) --- src/gallium/drivers/r600/r600_asm.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index a0dc1de..26a848a 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -1494,7 +1494,11 @@ static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode const struct cf_op_info *cfop = r600_isa_cf(cf-op); unsigned opcode = r600_isa_cf_opcode(bc-isa-hw_class, cf-op); - if (cfop-flags CF_ALU) { + + if (cf-op == CF_NATIVE) { + bc-bytecode[id++] = cf-isa[0]; + bc-bytecode[id++] = cf-isa[1]; + } else if (cfop-flags CF_ALU) { bc-bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf-addr 1) | S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf-kcache[0].mode) | S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf-kcache[0].bank) | -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600: Add support for native control flow
--- lib/Target/R600/AMDGPU.h | 1 + lib/Target/R600/AMDGPUTargetMachine.cpp| 1 + lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 21 +- lib/Target/R600/R600ControlFlowFinalizer.cpp | 264 + lib/Target/R600/R600Instructions.td| 100 .../CodeGen/R600/disconnected-predset-break-bug.ll | 5 +- test/CodeGen/R600/predicates.ll| 12 +- 7 files changed, 397 insertions(+), 7 deletions(-) create mode 100644 lib/Target/R600/R600ControlFlowFinalizer.cpp diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 3cd792a..0b01433 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -24,6 +24,7 @@ class AMDGPUTargetMachine; FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine tm); FunctionPass *createR600EmitClauseMarkers(TargetMachine tm); +FunctionPass *createR600ControlFlowFinalizer(TargetMachine tm); // SI Passes FunctionPass *createSIAnnotateControlFlowPass(); diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 45b1be0..e7ea876 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -153,6 +153,7 @@ bool AMDGPUPassConfig::addPreEmitPass() { addPass(createAMDGPUCFGStructurizerPass(*TM)); addPass(createR600EmitClauseMarkers(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); +addPass(createR600ControlFlowFinalizer(*TM)); addPass(FinalizeMachineBundlesID); } else { addPass(createSILowerControlFlowPass(*TM)); diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 1bf87fc..6ef4d40 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -147,6 +147,10 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, return; } else { switch(MI.getOpcode()) { +case AMDGPU::STACK_SIZE: { + EmitByte(MI.getOperand(0).getImm(), OS); + 
break; +} case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { uint64_t inst = getBinaryCodeForInstr(MI, Fixups); @@ -259,7 +263,22 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit(Inst, OS); break; } - +case AMDGPU::CF_TC: +case AMDGPU::CF_VC: +case AMDGPU::CF_CALL_FS: + return; +case AMDGPU::WHILE_LOOP: +case AMDGPU::END_LOOP: +case AMDGPU::LOOP_BREAK: +case AMDGPU::CF_CONTINUE: +case AMDGPU::CF_JUMP: +case AMDGPU::CF_ELSE: +case AMDGPU::POP: { + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + EmitByte(INSTR_NATIVE, OS); + Emit(Inst, OS); + break; +} default: EmitALUInstr(MI, Fixups, OS); break; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp new file mode 100644 index 000..bd87d74 --- /dev/null +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -0,0 +1,264 @@ +//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===--===// +// +/// \file +/// This pass compute turns all control flow pseudo instructions into native one +/// computing their address on the fly ; it also sets STACK_SIZE info. 
+//===--===// + +#include AMDGPU.h +#include R600Defines.h +#include R600InstrInfo.h +#include R600MachineFunctionInfo.h +#include R600RegisterInfo.h +#include llvm/CodeGen/MachineFunctionPass.h +#include llvm/CodeGen/MachineInstrBuilder.h +#include llvm/CodeGen/MachineRegisterInfo.h + +namespace llvm { + +class R600ControlFlowFinalizer : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + unsigned MaxFetchInst; + + bool isFetch(const MachineInstr *MI) const { +switch (MI-getOpcode()) { +case AMDGPU::TEX_VTX_CONSTBUF: +case AMDGPU::TEX_VTX_TEXBUF: +case AMDGPU::TEX_LD: +case AMDGPU::TEX_GET_TEXTURE_RESINFO: +case AMDGPU::TEX_GET_GRADIENTS_H: +case AMDGPU::TEX_GET_GRADIENTS_V: +case AMDGPU::TEX_SET_GRADIENTS_H: +case AMDGPU::TEX_SET_GRADIENTS_V: +case AMDGPU::TEX_SAMPLE: +case AMDGPU::TEX_SAMPLE_C: +case AMDGPU::TEX_SAMPLE_L: +case AMDGPU::TEX_SAMPLE_C_L: +case AMDGPU::TEX_SAMPLE_LB: +case AMDGPU::TEX_SAMPLE_C_LB: +case AMDGPU::TEX_SAMPLE_G: +case AMDGPU::TEX_SAMPLE_C_G: +case AMDGPU::TXD: +case AMDGPU::TXD_SHADOW: + return true; +default: +
Re: [Mesa-dev] [PATCH] r600g: don't reserve more stack space than required v4
Btw where can I find some more info on stack_size ? I assumed it should represent the amount of max stacked exec_mask, but it looks like it is possible to have many more manually pushed exec_mask levels than reported by nstack (iiuc a push counts as much as 1/4 of a loop level). - Mail original - De : Vadim Girlin vadimgir...@gmail.com À : Vincent Lejeune v...@ovi.com Cc : Alex Deucher alexdeuc...@gmail.com; mesa-dev@lists.freedesktop.org mesa-dev@lists.freedesktop.org Envoyé le : Dimanche 31 mars 2013 22h34 Objet : Re: [Mesa-dev] [PATCH] r600g: don't reserve more stack space than required v4 On 04/01/2013 12:00 AM, Vincent Lejeune wrote: Hi Vadim, Does this patch work ? (It's still not pushed) It works for me on evergreen, but I'm not sure about other chip generations. I wanted to ask somebody to test it, but the problem is that the piglit coverage for this is not enough (e.g. initial version of this patch had no regressions with piglit but resulted in artifacts with Heaven). I thought about adding more control flow tests but haven't written them yet. The same algorithm seemingly works in my r600-sb branch with other chips, but the test coverage with that branch is even lower due to the if-conversion that eliminates most of the conditional control flow. I usually prefer not to push any patches until I'm sure that they are not breaking anything. But well, possibly in this case it's easier to simply push it and wait for the bug reports. I think I'll check if it needs rebasing and push it in a day or two if there are no objections. Vadim I'm working on doing native control flow for llvm and intend to port your patch on the control flow reservation. Vincent ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] r600g: don't reserve more stack space than required v4
Hi Vadim, Does this patch work ? (It's still not pushed) I'm working on doing native control flow for llvm and intend to port your patch on the control flow reservation. Vincent - Mail original - De : Vadim Girlin vadimgir...@gmail.com À : Alex Deucher alexdeuc...@gmail.com Cc : mesa-dev@lists.freedesktop.org Envoyé le : Vendredi 22 février 2013 1h37 Objet : Re: [Mesa-dev] [PATCH] r600g: don't reserve more stack space than required v4 On 02/22/2013 04:23 AM, Alex Deucher wrote: On Thu, Feb 21, 2013 at 6:52 PM, Vadim Girlin vadimgir...@gmail.com wrote: v4: implement exact computation taking into account wavefront size Signed-off-by: Vadim Girlin vadimgir...@gmail.com --- src/gallium/drivers/r600/r600_asm.c | 44 +-- src/gallium/drivers/r600/r600_asm.h | 24 -- src/gallium/drivers/r600/r600_shader.c | 131 ++--- 3 files changed, 142 insertions(+), 57 deletions(-) diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index 3632aa5..f041e27 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -86,6 +86,38 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void) return tex; } +static unsigned stack_entry_size(enum radeon_family chip) { + /* Wavefront size: + * 64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/ + * Aruba/Sumo/Sumo2/redwood/juniper + * 32: R630/R730/R710/Palm/Cedar + * 16: R610/Rs780 + * + * Stack row size: + * Wavefront Size 16 32 48 64 + * Columns per Row (R6xx/R7xx/R8xx only) 8 8 4 4 + * Columns per Row (R9xx+) 8 4 4 4 */ + + switch (chip) { + /* FIXME: are some chips missing here? */ + /* wavefront size 16 */ + case CHIP_RV610: + case CHIP_RS780: RV620 RS880 Should be 16 as well. Thanks, I'll add them. 
Vadim + /* wavefront size 32 */ + case CHIP_RV630: + case CHIP_RV635: + case CHIP_RV730: + case CHIP_RV710: + case CHIP_PALM: + case CHIP_CEDAR: + return 8; + + /* wavefront size 64 */ + default: + return 4; + } +} + void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family, @@ -103,6 +135,7 @@ void r600_bytecode_init(struct r600_bytecode *bc, LIST_INITHEAD(bc-cf); bc-chip_class = chip_class; bc-msaa_texture_mode = msaa_texture_mode; + bc-stack.entry_size = stack_entry_size(family); } static int r600_bytecode_add_cf(struct r600_bytecode *bc) @@ -1524,8 +1557,8 @@ int r600_bytecode_build(struct r600_bytecode *bc) unsigned addr; int i, r; - if (bc-callstack[0].max 0) - bc-nstack = ((bc-callstack[0].max + 3) 2) + 2; + bc-nstack = bc-stack.max_entries; + if (bc-type == TGSI_PROCESSOR_VERTEX !bc-nstack) { bc-nstack = 1; } @@ -1826,8 +1859,8 @@ void r600_bytecode_disasm(struct r600_bytecode *bc) chip = '6'; break; } - fprintf(stderr, bytecode %d dw -- %d gprs -\n, - bc-ndw, bc-ngpr); + fprintf(stderr, bytecode %d dw -- %d gprs -- %d nstack -\n, + bc-ndw, bc-ngpr, bc-nstack); fprintf(stderr, shader %d -- %c\n, index++, chip); LIST_FOR_EACH_ENTRY(cf, bc-cf, list) { @@ -2105,7 +2138,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc) chip = '6'; break; } - fprintf(stderr, bytecode %d dw -- %d gprs -\n, bc-ndw, bc-ngpr); + fprintf(stderr, bytecode %d dw -- %d gprs -- %d nstack -\n, + bc-ndw, bc-ngpr, bc-nstack); fprintf(stderr, %c\n, chip); LIST_FOR_EACH_ENTRY(cf, bc-cf, list) { diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index 03cd238..5a9869d 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -173,16 +173,25 @@ struct r600_cf_stack_entry { }; #define SQ_MAX_CALL_DEPTH 0x0020 -struct r600_cf_callstack { - unsigned fc_sp_before_entry; - int sub_desc_index; - int current; - int max; -}; #define AR_HANDLE_NORMAL 0 #define AR_HANDLE_RV6XX 1 /* 
except RV670 */ +struct r600_stack_info { + /* current level of
[Mesa-dev] [PATCH] R600: Emit CF_ALU and use true kcache register.
--- lib/Target/R600/AMDGPU.h | 1 + lib/Target/R600/AMDGPUTargetMachine.cpp| 1 + lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 10 +- lib/Target/R600/R600EmitClauseMarkers.cpp | 250 + lib/Target/R600/R600Instructions.td| 79 ++- lib/Target/R600/R600RegisterInfo.td| 63 ++ test/CodeGen/R600/kcache-fold.ll | 2 +- 7 files changed, 395 insertions(+), 11 deletions(-) create mode 100644 lib/Target/R600/R600EmitClauseMarkers.cpp diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index e099a9f..3cd792a 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -23,6 +23,7 @@ class AMDGPUTargetMachine; // R600 Passes FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine tm); +FunctionPass *createR600EmitClauseMarkers(TargetMachine tm); // SI Passes FunctionPass *createSIAnnotateControlFlowPass(); diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 0185747..45b1be0 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -151,6 +151,7 @@ bool AMDGPUPassConfig::addPreEmitPass() { if (ST.device()-getGeneration() = AMDGPUDeviceInfo::HD6XXX) { addPass(createAMDGPUCFGPreparationPass(*TM)); addPass(createAMDGPUCFGStructurizerPass(*TM)); +addPass(createR600EmitClauseMarkers(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); addPass(FinalizeMachineBundlesID); } else { diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index a945fe9..1bf87fc 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -101,7 +101,8 @@ enum InstrTypes { INSTR_FC, INSTR_NATIVE, INSTR_VTX, - INSTR_EXPORT + INSTR_EXPORT, + INSTR_CFALU }; enum FCInstr { @@ -251,6 +252,13 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit(Inst, OS); break; } +case 
AMDGPU::CF_ALU: +case AMDGPU::CF_ALU_PUSH_BEFORE: { + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + EmitByte(INSTR_CFALU, OS); + Emit(Inst, OS); + break; +} default: EmitALUInstr(MI, Fixups, OS); diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp new file mode 100644 index 000..3c0e86e --- /dev/null +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -0,0 +1,250 @@ +//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===--===// +// +/// \file +/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold +/// 128 Alu instructions ; these instructions can access up to 4 prefetched +/// 4 lines of 16 registers from constant buffers. Such ALU clauses are +/// initiated by CF_ALU instructions. +//===--===// + +#include AMDGPU.h +#include R600Defines.h +#include R600InstrInfo.h +#include R600MachineFunctionInfo.h +#include R600RegisterInfo.h +#include llvm/CodeGen/MachineFunctionPass.h +#include llvm/CodeGen/MachineInstrBuilder.h +#include llvm/CodeGen/MachineRegisterInfo.h + +namespace llvm { + +class R600EmitClauseMarkersPass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + unsigned OccupiedDwords(MachineInstr *MI) const { +switch (MI-getOpcode()) { +case AMDGPU::INTERP_PAIR_XY: +case AMDGPU::INTERP_PAIR_ZW: +case AMDGPU::INTERP_VEC_LOAD: +case AMDGPU::DOT4_eg_pseudo: +case AMDGPU::DOT4_r600_pseudo: + return 4; +case AMDGPU::KILL: + return 0; +default: + break; +} + +if(TII-isVector(*MI) || +TII-isCubeOp(MI-getOpcode()) || +TII-isReductionOp(MI-getOpcode())) + return 4; + +unsigned NumLiteral = 0; +for (MachineInstr::mop_iterator It = MI-operands_begin(), +E = MI-operands_end(); It != E; ++It) { + MachineOperand MO = *It; + if (MO.isReg() MO.getReg() == AMDGPU::ALU_LITERAL_X) 
+++NumLiteral; +} +return 1 + NumLiteral; + } + + bool isALU(const MachineInstr *MI) const { +if (TII-isALUInstr(MI-getOpcode())) + return true; +if (TII-isVector(*MI) || TII-isCubeOp(MI-getOpcode())) + return true; +switch (MI-getOpcode()) { +case AMDGPU::INTERP_PAIR_XY: +case AMDGPU::INTERP_PAIR_ZW: +case AMDGPU::INTERP_VEC_LOAD: +case AMDGPU::COPY: +case AMDGPU::DOT4_eg_pseudo: +case
[Mesa-dev] [PATCH 1/2] R600: Emit native instructions for tex
--- lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 169 + lib/Target/R600/R600Instructions.td| 156 +++ 2 files changed, 196 insertions(+), 129 deletions(-) diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index d207160..00ebb44 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -66,8 +66,6 @@ private: void EmitSrcISA(const MCInst MI, unsigned RegOpIdx, unsigned SelOpIdx, raw_ostream OS) const; void EmitDst(const MCInst MI, raw_ostream OS) const; - void EmitTexInstr(const MCInst MI, SmallVectorImplMCFixup Fixups, -raw_ostream OS) const; void EmitFCInstr(const MCInst MI, raw_ostream OS) const; void EmitNullBytes(unsigned int byteCount, raw_ostream OS) const; @@ -140,9 +138,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo MCII, void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, SmallVectorImplMCFixup Fixups) const { - if (isTexOp(MI.getOpcode())) { -EmitTexInstr(MI, Fixups, OS); - } else if (isFCOp(MI.getOpcode())){ + if (isFCOp(MI.getOpcode())){ EmitFCInstr(MI, OS); } else if (MI.getOpcode() == AMDGPU::RETURN || MI.getOpcode() == AMDGPU::BUNDLE || @@ -175,6 +171,76 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit(InstWord2, OS); break; } +case AMDGPU::TEX_LD: +case AMDGPU::TEX_GET_TEXTURE_RESINFO: +case AMDGPU::TEX_SAMPLE: +case AMDGPU::TEX_SAMPLE_C: +case AMDGPU::TEX_SAMPLE_L: +case AMDGPU::TEX_SAMPLE_C_L: +case AMDGPU::TEX_SAMPLE_LB: +case AMDGPU::TEX_SAMPLE_C_LB: +case AMDGPU::TEX_SAMPLE_G: +case AMDGPU::TEX_SAMPLE_C_G: +case AMDGPU::TEX_GET_GRADIENTS_H: +case AMDGPU::TEX_GET_GRADIENTS_V: +case AMDGPU::TEX_SET_GRADIENTS_H: +case AMDGPU::TEX_SET_GRADIENTS_V: { + unsigned Opcode = MI.getOpcode(); + bool hasOffsets = (Opcode == AMDGPU::TEX_LD); + unsigned OpOffset = hasOffsets ? 
3 : 0; + int64_t Sampler = MI.getOperand(OpOffset + 3).getImm(); + int64_t TextureType = MI.getOperand(OpOffset + 4).getImm(); + + uint32_t srcSelect[4] = {0, 1, 2, 3}; + uint32_t Offsets[3] = {0 , 0, 0}; + uint64_t coordType[4] = {1, 1, 1, 1}; + + if (hasOffsets) +for (unsigned i = 0; i 3; i++) + Offsets[i] = MI.getOperand(i + 2).getImm(); + + if (TextureType == TEXTURE_RECT +|| TextureType == TEXTURE_SHADOWRECT) { +coordType[ELEMENT_X] = 0; +coordType[ELEMENT_Y] = 0; + } + + if (TextureType == TEXTURE_1D_ARRAY + || TextureType == TEXTURE_SHADOW1D_ARRAY) { +if (Opcode == AMDGPU::TEX_SAMPLE_C_L || Opcode == AMDGPU::TEX_SAMPLE_C_LB) { + coordType[ELEMENT_Y] = 0; +} else { + coordType[ELEMENT_Z] = 0; + srcSelect[ELEMENT_Z] = ELEMENT_Y; +} + } else if (TextureType == TEXTURE_2D_ARRAY + || TextureType == TEXTURE_SHADOW2D_ARRAY) { +coordType[ELEMENT_Z] = 0; + } + + + if ((TextureType == TEXTURE_SHADOW1D + || TextureType == TEXTURE_SHADOW2D + || TextureType == TEXTURE_SHADOWRECT + || TextureType == TEXTURE_SHADOW1D_ARRAY) + Opcode != AMDGPU::TEX_SAMPLE_C_L + Opcode != AMDGPU::TEX_SAMPLE_C_LB) { +srcSelect[ELEMENT_W] = ELEMENT_Z; + } + + uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) | + coordType[ELEMENT_X] 60 | coordType[ELEMENT_Y] 61 | + coordType[ELEMENT_Z] 62 | coordType[ELEMENT_W] 63; + uint32_t Word2 = Sampler 15 | srcSelect[ELEMENT_X] 20 | + srcSelect[ELEMENT_Y] 23 | srcSelect[ELEMENT_Z] 26 | + srcSelect[ELEMENT_W] 29 | Offsets[0] 0 | Offsets[1] 5 | + Offsets[2] 10; + + EmitByte(INSTR_TEX, OS); + Emit(Word01, OS); + Emit(Word2, OS); + break; +} case AMDGPU::EG_ExportSwz: case AMDGPU::R600_ExportSwz: case AMDGPU::EG_ExportBuf: @@ -334,99 +400,6 @@ void R600MCCodeEmitter::EmitSrcISA(const MCInst MI, unsigned RegOpIdx, Emit(InlineConstant.i, OS); } -void R600MCCodeEmitter::EmitTexInstr(const MCInst MI, - SmallVectorImplMCFixup Fixups, - raw_ostream OS) const { - - unsigned Opcode = MI.getOpcode(); - bool hasOffsets = (Opcode == AMDGPU::TEX_LD); - unsigned 
OpOffset = hasOffsets ? 3 : 0; - int64_t Resource = MI.getOperand(OpOffset + 2).getImm(); - int64_t Sampler = MI.getOperand(OpOffset + 3).getImm(); - int64_t TextureType = MI.getOperand(OpOffset + 4).getImm(); - unsigned srcSelect[4] = {0, 1, 2, 3}; - - // Emit instruction type - EmitByte(1, OS); - - // Emit instruction - EmitByte(getBinaryCodeForInstr(MI,
[Mesa-dev] [PATCH 2/2] R600: Emit CF_ALU and use true kcache register.
--- lib/Target/R600/AMDGPU.h | 1 + lib/Target/R600/AMDGPUTargetMachine.cpp| 1 + lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 10 +- lib/Target/R600/R600EmitClauseMarkers.cpp | 243 + lib/Target/R600/R600Instructions.td| 83 ++- lib/Target/R600/R600RegisterInfo.td| 63 ++ 6 files changed, 389 insertions(+), 12 deletions(-) create mode 100644 lib/Target/R600/R600EmitClauseMarkers.cpp diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index e099a9f..3cd792a 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -23,6 +23,7 @@ class AMDGPUTargetMachine; // R600 Passes FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine tm); +FunctionPass *createR600EmitClauseMarkers(TargetMachine tm); // SI Passes FunctionPass *createSIAnnotateControlFlowPass(); diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 0185747..45b1be0 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -151,6 +151,7 @@ bool AMDGPUPassConfig::addPreEmitPass() { if (ST.device()-getGeneration() = AMDGPUDeviceInfo::HD6XXX) { addPass(createAMDGPUCFGPreparationPass(*TM)); addPass(createAMDGPUCFGStructurizerPass(*TM)); +addPass(createR600EmitClauseMarkers(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); addPass(FinalizeMachineBundlesID); } else { diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 00ebb44..cf43f3f 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -101,7 +101,8 @@ enum InstrTypes { INSTR_FC, INSTR_NATIVE, INSTR_VTX, - INSTR_EXPORT + INSTR_EXPORT, + INSTR_CFALU }; enum FCInstr { @@ -250,6 +251,13 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit(Inst, OS); break; } +case AMDGPU::CF_ALU: +case 
AMDGPU::CF_ALU_PUSH_BEFORE: { + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + EmitByte(INSTR_CFALU, OS); + Emit(Inst, OS); + break; +} default: EmitALUInstr(MI, Fixups, OS); diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp new file mode 100644 index 000..b869c88 --- /dev/null +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -0,0 +1,243 @@ +//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===--===// +// +/// \file +/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold +/// 128 Alu instructions ; these instructions can access up to 4 prefetched +/// 4 lines of 16 registers from constant buffers. Such ALU clauses are +/// initiated by CF_ALU instructions. +//===--===// + +#include AMDGPU.h +#include R600Defines.h +#include R600InstrInfo.h +#include R600MachineFunctionInfo.h +#include R600RegisterInfo.h +#include llvm/CodeGen/MachineFunctionPass.h +#include llvm/CodeGen/MachineInstrBuilder.h +#include llvm/CodeGen/MachineRegisterInfo.h + +namespace llvm { + +class R600EmitClauseMarkersPass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + unsigned OccupiedDwords(MachineInstr *MI) const { +switch (MI-getOpcode()) { +case AMDGPU::INTERP_PAIR_XY: +case AMDGPU::INTERP_PAIR_ZW: +case AMDGPU::INTERP_VEC_LOAD: +case AMDGPU::DOT4_eg_pseudo: +case AMDGPU::DOT4_r600_pseudo: + return 4; +case AMDGPU::KILL: + return 0; +default: + break; +} + +if(TII-isVector(*MI) || +TII-isCubeOp(MI-getOpcode()) || +TII-isReductionOp(MI-getOpcode())) + return 4; + +unsigned NumLiteral = 0; +for (MachineInstr::mop_iterator It = MI-operands_begin(), +E = MI-operands_end(); It != E; ++It) { + MachineOperand MO = *It; + if (MO.isReg() MO.getReg() == AMDGPU::ALU_LITERAL_X) +++NumLiteral; +} +return 1 + 
NumLiteral; + } + + bool isALU(const MachineInstr *MI) const { +if (TII-isALUInstr(MI-getOpcode())) + return true; +if (TII-isVector(*MI) || TII-isCubeOp(MI-getOpcode())) + return true; +switch (MI-getOpcode()) { +case AMDGPU::INTERP_PAIR_XY: +case AMDGPU::INTERP_PAIR_ZW: +case AMDGPU::INTERP_VEC_LOAD: +case AMDGPU::COPY: +case AMDGPU::DOT4_eg_pseudo: +case AMDGPU::DOT4_r600_pseudo: + return true; +default: +
[Mesa-dev] [PATCH 1/2] r600g/llvm: use native encode for tex
--- src/gallium/drivers/r600/r600_shader.c | 50 ++ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 29facf7..1e21559 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -489,29 +489,33 @@ static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx, { struct r600_bytecode_tex tex; - tex.op = r600_isa_fetch_by_opcode(ctx-bc-isa, bytes[bytes_read++]); - tex.resource_id = bytes[bytes_read++]; - tex.src_gpr = bytes[bytes_read++]; - tex.src_rel = bytes[bytes_read++]; - tex.dst_gpr = bytes[bytes_read++]; - tex.dst_rel = bytes[bytes_read++]; - tex.dst_sel_x = bytes[bytes_read++]; - tex.dst_sel_y = bytes[bytes_read++]; - tex.dst_sel_z = bytes[bytes_read++]; - tex.dst_sel_w = bytes[bytes_read++]; - tex.lod_bias = bytes[bytes_read++]; - tex.coord_type_x = bytes[bytes_read++]; - tex.coord_type_y = bytes[bytes_read++]; - tex.coord_type_z = bytes[bytes_read++]; - tex.coord_type_w = bytes[bytes_read++]; - tex.offset_x = bytes[bytes_read++]; - tex.offset_y = bytes[bytes_read++]; - tex.offset_z = bytes[bytes_read++]; - tex.sampler_id = bytes[bytes_read++]; - tex.src_sel_x = bytes[bytes_read++]; - tex.src_sel_y = bytes[bytes_read++]; - tex.src_sel_z = bytes[bytes_read++]; - tex.src_sel_w = bytes[bytes_read++]; + uint32_t word0 = i32_from_byte_stream(bytes, bytes_read); + uint32_t word1 = i32_from_byte_stream(bytes, bytes_read); + uint32_t word2 = i32_from_byte_stream(bytes, bytes_read); + + tex.op = r600_isa_fetch_by_opcode(ctx-bc-isa, G_SQ_TEX_WORD0_TEX_INST(word0)); + tex.resource_id = G_SQ_TEX_WORD0_RESOURCE_ID(word0); + tex.src_gpr = G_SQ_TEX_WORD0_SRC_GPR(word0); + tex.src_rel = G_SQ_TEX_WORD0_SRC_REL(word0); + tex.dst_gpr = G_SQ_TEX_WORD1_DST_GPR(word1); + tex.dst_rel = G_SQ_TEX_WORD1_DST_REL(word1); + tex.dst_sel_x = G_SQ_TEX_WORD1_DST_SEL_X(word1); + tex.dst_sel_y = G_SQ_TEX_WORD1_DST_SEL_Y(word1); + tex.dst_sel_z = 
G_SQ_TEX_WORD1_DST_SEL_Z(word1); + tex.dst_sel_w = G_SQ_TEX_WORD1_DST_SEL_W(word1); + tex.lod_bias = G_SQ_TEX_WORD1_LOD_BIAS(word1); + tex.coord_type_x = G_SQ_TEX_WORD1_COORD_TYPE_X(word1); + tex.coord_type_y = G_SQ_TEX_WORD1_COORD_TYPE_Y(word1); + tex.coord_type_z = G_SQ_TEX_WORD1_COORD_TYPE_Z(word1); + tex.coord_type_w = G_SQ_TEX_WORD1_COORD_TYPE_W(word1); + tex.offset_x = G_SQ_TEX_WORD2_OFFSET_X(word2); + tex.offset_y = G_SQ_TEX_WORD2_OFFSET_Y(word2); + tex.offset_z = G_SQ_TEX_WORD2_OFFSET_Z(word2); + tex.sampler_id = G_SQ_TEX_WORD2_SAMPLER_ID(word2); + tex.src_sel_x = G_SQ_TEX_WORD2_SRC_SEL_X(word2); + tex.src_sel_y = G_SQ_TEX_WORD2_SRC_SEL_Y(word2); + tex.src_sel_z = G_SQ_TEX_WORD2_SRC_SEL_Z(word2); + tex.src_sel_w = G_SQ_TEX_WORD2_SRC_SEL_W(word2); tex.inst_mod = 0; -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] r600g/llvm: Add support for cf_alu native encode
--- src/gallium/drivers/r600/r600_asm.c| 2 +- src/gallium/drivers/r600/r600_asm.h| 1 + src/gallium/drivers/r600/r600_shader.c | 14 ++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index 0d570ca..65c705d 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -106,7 +106,7 @@ void r600_bytecode_init(struct r600_bytecode *bc, bc-msaa_texture_mode = msaa_texture_mode; } -static int r600_bytecode_add_cf(struct r600_bytecode *bc) +int r600_bytecode_add_cf(struct r600_bytecode *bc) { struct r600_bytecode_cf *cf = r600_bytecode_cf(); diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index 1465c31..c1aa3ba 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -227,6 +227,7 @@ int r600_bytecode_add_tex(struct r600_bytecode *bc, int r600_bytecode_add_output(struct r600_bytecode *bc, const struct r600_bytecode_output *output); int r600_bytecode_build(struct r600_bytecode *bc); +int r600_bytecode_add_cf(struct r600_bytecode *bc); int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op); int r600_bytecode_add_alu_type(struct r600_bytecode *bc, diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 1e21559..6fd1f42 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -626,6 +626,20 @@ static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx, bytes_read = r600_export_from_byte_stream(ctx, bytes, bytes_read); break; + case 6: { + int32_t word0 = i32_from_byte_stream(bytes, bytes_read); + int32_t word1 = i32_from_byte_stream(bytes, bytes_read); + + r600_bytecode_add_cf(ctx-bc); + ctx-bc-cf_last-op = r600_isa_cf_by_opcode(ctx-bc-isa, G_SQ_CF_ALU_WORD1_CF_INST(word1), 1); + ctx-bc-cf_last-kcache[0].bank = G_SQ_CF_ALU_WORD0_KCACHE_BANK0(word0); + 
ctx-bc-cf_last-kcache[0].addr = G_SQ_CF_ALU_WORD1_KCACHE_ADDR0(word1); + ctx-bc-cf_last-kcache[0].mode = G_SQ_CF_ALU_WORD0_KCACHE_MODE0(word0); + ctx-bc-cf_last-kcache[1].bank = G_SQ_CF_ALU_WORD0_KCACHE_BANK1(word0); + ctx-bc-cf_last-kcache[1].addr = G_SQ_CF_ALU_WORD1_KCACHE_ADDR1(word1); + ctx-bc-cf_last-kcache[1].mode = G_SQ_CF_ALU_WORD1_KCACHE_MODE1(word1); + break; + } default: /* XXX: Error here */ break; -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600g: Add get/set to handle ALLOC_EXPORT_RAT_WORD0
--- src/gallium/drivers/r600/eg_asm.c | 38 +++ src/gallium/drivers/r600/eg_sq.h | 59 src/gallium/drivers/r600/r600_asm.c| 119 + src/gallium/drivers/r600/r600_asm.h| 8 ++- src/gallium/drivers/r600/r600_shader.c | 34 +++--- 5 files changed, 248 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c index fffc436..cacb82f 100644 --- a/src/gallium/drivers/r600/eg_asm.c +++ b/src/gallium/drivers/r600/eg_asm.c @@ -106,6 +106,22 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf) if (bc-chip_class == EVERGREEN) /* no EOP on cayman */ bc-bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf-output.end_of_program); id++; + } else if (cfop-flags CF_MEM) { + /* MEM_RAT_CACHELESS instructions */ + bc-bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RW_GPR(cf-output.gpr) | + S_SQ_CF_ALLOC_EXPORT_WORD0_RAT_ELEM_SIZE(cf-output.elem_size) | + S_SQ_CF_ALLOC_EXPORT_WORD0_RAT_ID(cf-output.rat_id) | + S_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_INST(cf-output.rat_inst) | + S_SQ_CF_ALLOC_EXPORT_WORD0_RAT_INDEX_GPR(cf-output.index_gpr) | + S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf-output.type); + bc-bytecode[id] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf-output.burst_count - 1) | + S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf-output.barrier) | + S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) | + S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf-output.comp_mask) | + S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf-output.array_size); + if (bc-chip_class == EVERGREEN) /* no EOP on cayman */ + bc-bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf-output.end_of_program); + id++; } else { /* branch, loop, call, return instructions */ bc-bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf-cf_addr 1); @@ -118,6 +134,28 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf) return 0; } +void eg_bytecode_export_rat_read(struct r600_bytecode *bc, + struct r600_bytecode_output *output, uint32_t word0, uint32_t 
word1) { + output-rat_id = G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_ID(word0); + output-rat_inst = G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_INST(word0); + output-type = G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_TYPE(word0); + output-gpr = G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RW_GPR(word0); + output-elem_size = G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_ELEM_SIZE(word0); + output-index_gpr = G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_INDEX_GPR(word0); + + output-swizzle_x = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(word1); + output-swizzle_y = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(word1); + output-swizzle_z = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(word1); + output-swizzle_w = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(word1); + output-burst_count = G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(word1); + output-end_of_program = G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(word1); + output-op = r600_isa_cf_by_opcode(bc-isa, + G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(word1), /* is_cf_alu = */ 0 ); + output-barrier = G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(word1); + output-array_size = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(word1); + output-comp_mask = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(word1); +} + void eg_bytecode_export_read(struct r600_bytecode *bc, struct r600_bytecode_output *output, uint32_t word0, uint32_t word1) { diff --git a/src/gallium/drivers/r600/eg_sq.h b/src/gallium/drivers/r600/eg_sq.h index b534872..83588de 100644 --- a/src/gallium/drivers/r600/eg_sq.h +++ b/src/gallium/drivers/r600/eg_sq.h @@ -176,6 +176,65 @@ #define G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(x)(((x) 30) 0x3) #define C_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE 0x3FFF /* done */ +#define P_SQ_CF_ALLOC_EXPORT_WORD0_RAT +#define S_SQ_CF_ALLOC_EXPORT_WORD0_RAT_ID(x) (((x) 0xF) 0) +#define G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_ID(x) (((x) 0) 0xF) +#define S_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_INST(x) (((x) 0x3F) 4) +#define G_SQ_CF_ALLOC_EXPORT_WORD0_RAT_RAT_INST(x) (((x) 4) 0x3F) +#define
[Mesa-dev] [PATCH] R600: Use CONSTANT_BUFFER_0 address space for Implicit Parameters
This allows the backend to generate reads from the constant cache, which are faster than VTX_READ. --- lib/Target/R600/R600ISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index a0e27ea..caa1899 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -522,7 +522,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG DAG, EVT VT, unsigned DwordOffset) const { unsigned ByteOffset = DwordOffset * 4; PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::PARAM_I_ADDRESS); + AMDGPUAS::CONSTANT_BUFFER_0); // We shouldn't be using an offset wider than 16-bits for implicit parameters. assert(isInt16(ByteOffset)); -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] R600: Relax some vector constraints on Dot4.
Hi Christian, LLVM does indeed coalesce registers for R600 targets; however, I was thinking of copies between vectors. For instance, let's say you have 4 vectors coming from instructions that only emit vectors (like TEX_SAMPLE iirc) : If the shader wants to mix them before doing dp4, you end up with something like : T0_XYZW = TEX_SAMPLE T1_XYZW = TEX_SAMPLE T2_XYZW = TEX_SAMPLE T3_XYZW = TEX_SAMPLE T0_W = COPY T4_W T1_Z = COPY T3_Z DOT4 T0_XYZW, T1_XYZW From the hw point of view, the 2 copies are not necessary because the DOT4 instruction does not require that its operands belong to the same 128-bit register. It's perfectly legal to have a bundle like this one : Dot4_eg_real T0_X T1_X Dot4_eg_real T0_Y T1_Y Dot4_eg_real T0_Z T3_Z Dot4_eg_real T4_W T1_W (In fact it is even possible to remove the R600_TReg32_* constraints on the inputs but then you have to ensure the bundle does not read more than 3 gprs from a channel, which needs much more work) The previous case may not seem so frequent but it still occurs in Lightmark and Unigine Heaven. We represent dot4 inputs as vectors but using 8 scalar inputs is closer to hw capabilities, that's why I wrote this patch. Besides, scalar values usually have shorter live intervals, lowering register pressure. Shaders that have dp4 instructions often end up consuming fewer registers with this patch. Vincent - Mail original - De : Christian König deathsim...@vodafone.de À : Vincent Lejeune v...@ovi.com Cc : llvm-comm...@cs.uiuc.edu; mesa-dev@lists.freedesktop.org Envoyé le : Vendredi 15 mars 2013 11h18 Objet : Re: [Mesa-dev] [PATCH] R600: Relax some vector constraints on Dot4. Hi Vincent, while I really appreciate your work, I think your development is going in the wrong direction here. Those copies you're trying to avoid (not only with this patch, but also with the previous REG_SEQUENCE patches), shouldn't happen in the first place. 
I'm not so deeply into the R600 part of our LLVM backend that I can say that I'm 100% sure, but to me that just looks like workarounds to an incorrect defined register space. Here is an simple example from SI, that should show how things are intended to work. It's a simple 2D texture fetch, the coordinates of that this fetch are usually provided in an two element vector build of VGPRs (I use a 2D fetch just for simplicity, a 3D fetch with explicit LOD would work the same way and would use a four element vector). After ISel the assembler code starts with something like this (simplified): ... %vreg13def,tied1 = V_INTERP_P2_F32 ... ... %vreg17def,tied1 = V_INTERP_P2_F32 ... ... %vreg22def = IMPLICIT_DEF; VReg_64:%vreg22 %vreg21def,tied1 = INSERT_SUBREG %vreg22tied0, %vreg13kill, sub0; VReg_64:%vreg21,%vreg22 VReg_32:%vreg13 %vreg23def,tied1 = INSERT_SUBREG %vreg21tied0, %vreg17kill, sub1; VReg_64:%vreg23,%vreg21 VReg_32:%vreg17 %vreg24def = IMAGE_SAMPLE 15, 0, 0, 0, 0, 0, 0, 0, %vreg23kill, As you can see the sub components of the vectors are inserted/extracted just like it happens on R600, but the registerallocater is capable of handling that much better than on R600 and so avoiding the (sometimes quite expensive) COPY operations in the first place. The resulting code looks like this: ... %vreg23:sub0def,tied1 = V_INTERP_P2_F32 ... ... %vreg23:sub1def,tied1 = V_INTERP_P2_F32 ... ... %vreg24def = IMAGE_SAMPLE 15, 0, 0, 0, 0, 0, 0, 0, %vreg23, ... So INSERT_SUBREG isn't replaced with a COPY like on R600, but instead the V_INTERP_P2_F32 instructions can write directly to the appropriate sub register component. I'm not 100% sure why this doesn't work the same way on R600, but I think it might be a good idea figuring that out. Cheers, Christian. Am 14.03.2013 21:51, schrieb Vincent Lejeune: Dot4 now uses 8 scalar operands instead of 2 vectors one which allows register coalescer to remove some unneeded COPY. 
This patch also defines some structures/functions that can be used to handle every vector instructions (CUBE, Cayman special instructions...) in a similar fashion. --- lib/Target/R600/AMDGPUISelLowering.h | 1 + lib/Target/R600/R600Defines.h | 74 lib/Target/R600/R600ExpandSpecialInstrs.cpp | 25 lib/Target/R600/R600ISelLowering.cpp | 21 +++ lib/Target/R600/R600InstrInfo.cpp | 88 + lib/Target/R600/R600InstrInfo.h | 5 ++ lib/Target/R600/R600Instructions.td | 51 - lib/Target/R600/R600MachineScheduler.cpp | 2 + 8 files changed, 266 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index f31b646..f9f5a60 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -125,6 +125,7 @@ enum { SMIN, UMIN
Re: [Mesa-dev] Google Summer of Code ideas needed
Hi, If LLVM backend development is allowed, maybe a student could work on improving VLIW5 scheduling for R600 hardware. So far I focused on VLIW4 architecture, but extending the scheduler to support Trans ALU wouldn't be too hard. This would require a way to represent Trans slot compatibility for instructions in R600Instructions.td, checking for additional constant read/literal limitations on this slot, and modifying a couple of functions inside R600MachineScheduler.cpp. This may look like a short task but the student would also need some time to get used to all the tools we use, like piglit, and to understand the llvm codebase. - Mail original - De : Tom Stellard t...@stellard.net À : mesa-dev@lists.freedesktop.org Cc : Envoyé le : Mercredi 13 mars 2013 18h11 Objet : [Mesa-dev] Google Summer of Code ideas needed Hi, It's time again for Google Summer of Code, so we need to start updating the X.Org ideas page (http://www.x.org/wiki/SummerOfCodeIdeas) with new ideas. Since there have been a few issues with the wikis lately, if you have any ideas please respond to this thread, and I will make sure they get onto the official ideas page (but still feel free to update the wiki page yourself if you can). A good project description should contain: - A brief description of the project - A difficulty rating (e.g. easy, medium, hard) - The skills / programming languages required Also, I am going to purge all the old ideas from the ideas page in the next week, so if there are any of the old ideas that you think are still relevant, let me know and I will keep them. The ideas page is used as one of the criteria by Google for selecting mentoring organizations and part of the reason X.Org was not selected last year was that the ideas page was not up to par, so if we want to participate in Google Summer of Code this year, it is important we have a good ideas page with lots of ideas. 
Thanks, Tom Stellard ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600: Relax some vector constraints on Dot4.
Dot4 now uses 8 scalar operands instead of 2 vectors one which allows register coalescer to remove some unneeded COPY. This patch also defines some structures/functions that can be used to handle every vector instructions (CUBE, Cayman special instructions...) in a similar fashion. --- lib/Target/R600/AMDGPUISelLowering.h| 1 + lib/Target/R600/R600Defines.h | 74 lib/Target/R600/R600ExpandSpecialInstrs.cpp | 25 lib/Target/R600/R600ISelLowering.cpp| 21 +++ lib/Target/R600/R600InstrInfo.cpp | 88 + lib/Target/R600/R600InstrInfo.h | 5 ++ lib/Target/R600/R600Instructions.td | 51 - lib/Target/R600/R600MachineScheduler.cpp| 2 + 8 files changed, 266 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index f31b646..f9f5a60 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -125,6 +125,7 @@ enum { SMIN, UMIN, URECIP, + DOT4, EXPORT, CONST_ADDRESS, REGISTER_LOAD, diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index 16cfcf5..72d83b0 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -92,6 +92,80 @@ namespace R600Operands { {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17} }; + enum VecOps { +UPDATE_EXEC_MASK_X, +UPDATE_PREDICATE_X, +WRITE_X, +OMOD_X, +DST_REL_X, +CLAMP_X, +SRC0_X, +SRC0_NEG_X, +SRC0_REL_X, +SRC0_ABS_X, +SRC0_SEL_X, +SRC1_X, +SRC1_NEG_X, +SRC1_REL_X, +SRC1_ABS_X, +SRC1_SEL_X, +PRED_SEL_X, +UPDATE_EXEC_MASK_Y, +UPDATE_PREDICATE_Y, +WRITE_Y, +OMOD_Y, +DST_REL_Y, +CLAMP_Y, +SRC0_Y, +SRC0_NEG_Y, +SRC0_REL_Y, +SRC0_ABS_Y, +SRC0_SEL_Y, +SRC1_Y, +SRC1_NEG_Y, +SRC1_REL_Y, +SRC1_ABS_Y, +SRC1_SEL_Y, +PRED_SEL_Y, +UPDATE_EXEC_MASK_Z, +UPDATE_PREDICATE_Z, +WRITE_Z, +OMOD_Z, +DST_REL_Z, +CLAMP_Z, +SRC0_Z, +SRC0_NEG_Z, +SRC0_REL_Z, +SRC0_ABS_Z, +SRC0_SEL_Z, +SRC1_Z, +SRC1_NEG_Z, +SRC1_REL_Z, +SRC1_ABS_Z, +SRC1_SEL_Z, +PRED_SEL_Z, +UPDATE_EXEC_MASK_W, +UPDATE_PREDICATE_W, +WRITE_W, 
+OMOD_W, +DST_REL_W, +CLAMP_W, +SRC0_W, +SRC0_NEG_W, +SRC0_REL_W, +SRC0_ABS_W, +SRC0_SEL_W, +SRC1_W, +SRC1_NEG_W, +SRC1_REL_W, +SRC1_ABS_W, +SRC1_SEL_W, +PRED_SEL_W, +IMM_0, +IMM_1, +VEC_COUNT + }; + } #endif // R600DEFINES_H_ diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index f8c900f..993bdad 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -182,6 +182,31 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction MF) { MI.eraseFromParent(); continue; } + case AMDGPU::DOT_4: { + +const R600RegisterInfo TRI = TII-getRegisterInfo(); + +unsigned DstReg = MI.getOperand(0).getReg(); +unsigned DstBase = TRI.getEncodingValue(DstReg) HW_REG_MASK; + +for (unsigned Chan = 0; Chan 4; ++Chan) { + bool Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned SubDstReg = + AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + MachineInstr *BMI = + TII-buildSlotOfVectorInstruction(MBB, MI, Chan, SubDstReg); + if (Chan 0) { +BMI-bundleWithPred(); + } + if (Mask) { +TII-addFlag(BMI, 0, MO_FLAG_MASK); + } + if (Chan != 3) +TII-addFlag(BMI, 0, MO_FLAG_NOT_LAST); +} +MI.eraseFromParent(); +continue; + } } bool IsReduction = TII-isReductionOp(MI.getOpcode()); diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index a73691d..4868dc7 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -394,6 +394,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const return SDValue(interp, slot % 2); } +case AMDGPUIntrinsic::AMDGPU_dp4: { + SDValue Args[8] = { + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(0, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(0, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(1, 
MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(1, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), +
[Mesa-dev] [PATCH] R600: Factorize code handling Const Read Port limitation
--- lib/Target/R600/AMDILISelDAGToDAG.cpp| 34 ++ lib/Target/R600/R600InstrInfo.cpp| 54 ++ lib/Target/R600/R600InstrInfo.h | 3 ++ lib/Target/R600/R600MachineScheduler.cpp | 77 lib/Target/R600/R600MachineScheduler.h | 3 +- test/CodeGen/R600/kcache-fold-2.ll | 52 + 6 files changed, 144 insertions(+), 79 deletions(-) create mode 100644 test/CodeGen/R600/kcache-fold-2.ll diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index 0c7880d..05a1ea7 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -336,6 +336,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return Result; } + bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, const R600InstrInfo *TII, std::vectorSDValue Ops) { int OperandIdx[] = { @@ -365,17 +366,34 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, SDValue Operand = Ops[OperandIdx[i] - 1]; switch (Operand.getOpcode()) { case AMDGPUISD::CONST_ADDRESS: { - if (i == 2) -break; SDValue CstOffset; - if (!Operand.getValueType().isVector() - SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { -Ops[OperandIdx[i] - 1] = CurDAG-getRegister(AMDGPU::ALU_CONST, MVT::f32); -Ops[SelIdx[i] - 1] = CstOffset; -return true; + if (Operand.getValueType().isVector() || + !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) +break; + + // Gather others constants values + std::vectorunsigned Consts; + for (unsigned j = 0; j 3; j++) { +int SrcIdx = OperandIdx[j]; +if (SrcIdx 0) + break; +if (RegisterSDNode *Reg = dyn_castRegisterSDNode(Ops[SrcIdx - 1])) { + if (Reg-getReg() == AMDGPU::ALU_CONST) { +ConstantSDNode *Cst = dyn_castConstantSDNode(Ops[SelIdx[j] - 1]); +Consts.push_back(Cst-getZExtValue()); + } +} } + + ConstantSDNode *Cst = dyn_castConstantSDNode(CstOffset); + Consts.push_back(Cst-getZExtValue()); + if (!TII-fitsConstReadLimitations(Consts)) +break; + + Ops[OperandIdx[i] - 1] = CurDAG-getRegister(AMDGPU::ALU_CONST, MVT::f32); + 
Ops[SelIdx[i] - 1] = CstOffset; + return true; } - break; case ISD::FNEG: if (NegIdx[i] 0) break; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index be3318a..0865098 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -139,6 +139,60 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const { (TargetFlags R600_InstFlag::OP3)); } +bool +R600InstrInfo::fitsConstReadLimitations(const std::vectorunsigned Consts) +const { + assert (Consts.size() = 12 Too many operands in instructions group); + unsigned Pair1 = 0, Pair2 = 0; + for (unsigned i = 0, n = Consts.size(); i n; ++i) { +unsigned ReadConstHalf = Consts[i] 2; +unsigned ReadConstIndex = Consts[i] (~3); +unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; +if (!Pair1) { + Pair1 = ReadHalfConst; + continue; +} +if (Pair1 == ReadHalfConst) + continue; +if (!Pair2) { + Pair2 = ReadHalfConst; + continue; +} +if (Pair2 != ReadHalfConst) + return false; + } + return true; +} + +bool +R600InstrInfo::canBundle(const std::vectorMachineInstr * MIs) const { + std::vectorunsigned Consts; + for (unsigned i = 0, n = MIs.size(); i n; i++) { +const MachineInstr *MI = MIs[i]; + +const R600Operands::Ops OpTable[3][2] = { + {R600Operands::SRC0, R600Operands::SRC0_SEL}, + {R600Operands::SRC1, R600Operands::SRC1_SEL}, + {R600Operands::SRC2, R600Operands::SRC2_SEL}, +}; + +if (!isALUInstr(MI-getOpcode())) + continue; + +for (unsigned j = 0; j 3; j++) { + int SrcIdx = getOperandIdx(MI-getOpcode(), OpTable[j][0]); + if (SrcIdx 0) +break; + if (MI-getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) { +unsigned Const = MI-getOperand( +getOperandIdx(MI-getOpcode(), OpTable[j][1])).getImm(); +Consts.push_back(Const); + } +} + } + return fitsConstReadLimitations(Consts); +} + DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM, const ScheduleDAG *DAG) const { const InstrItineraryData *II = TM-getInstrItineraryData(); diff --git 
a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index efe721c..bf9569e 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -53,6 +53,9 @@ namespace llvm { /// \returns true if this \p Opcode represents an ALU instruction. bool isALUInstr(unsigned Opcode) const; + bool fitsConstReadLimitations(const std::vectorunsigned) const; +
[Mesa-dev] [PATCH] R600: Lower clamp constant to constant
--- lib/Target/R600/R600ISelLowering.cpp | 23 +++ test/CodeGen/R600/clamp-constants.ll | 20 2 files changed, 43 insertions(+) create mode 100644 test/CodeGen/R600/clamp-constants.ll diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index a73691d..96686e6 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -394,6 +394,29 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const return SDValue(interp, slot % 2); } +case AMDGPUIntrinsic::AMDIL_clamp: { + ConstantFPSDNode *Min = dyn_castConstantFPSDNode(Op.getOperand(2)); + ConstantFPSDNode *Max = dyn_castConstantFPSDNode(Op.getOperand(3)); + if (ConstantFPSDNode *C = dyn_castConstantFPSDNode(Op.getOperand(1))) { +switch (C-getValueAPF().compare(Max-getValueAPF())) { +case APFloat::cmpGreaterThan: +case APFloat::cmpEqual: + return Op.getOperand(3); +default: + break; +} + +switch (C-getValueAPF().compare(Min-getValueAPF())) { +case APFloat::cmpLessThan: +case APFloat::cmpEqual: + return Op.getOperand(2); +default: + break; +} +return Op.getOperand(1); + } + break; +} case r600_read_ngroups_x: return LowerImplicitParameter(DAG, VT, DL, 0); diff --git a/test/CodeGen/R600/clamp-constants.ll b/test/CodeGen/R600/clamp-constants.ll new file mode 100644 index 000..cf4d35f --- /dev/null +++ b/test/CodeGen/R600/clamp-constants.ll @@ -0,0 +1,20 @@ +;RUN: llc %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: MOV + +define void @main() { +main_body: + %0 = call float @llvm.AMDIL.clamp.(float 1.50e+00, float 0.00e+00, float 1.00e+00) + %1 = call float @llvm.AMDIL.clamp.(float 0.00e+00, float 0.00e+00, float 1.00e+00) + %2 = call float @llvm.AMDIL.clamp.(float 1.00e+00, float 0.00e+00, float 1.00e+00) + %3 = call float @llvm.AMDIL.clamp.(float -0.50e+00, float 0.00e+00, float 1.00e+00) + %4 = insertelement 4 x float undef, float %0, i32 0 + %5 = insertelement 4 x float %4, float %1, i32 1 + %6 = insertelement 4 x float 
%5, float %2, i32 2 + %7 = insertelement 4 x float %6, float %3, i32 3 + call void @llvm.R600.store.swizzle(4 x float %7, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDIL.clamp.(float, float, float) readnone +declare void @llvm.R600.store.swizzle(4 x float, i32, i32) -- 1.8.1.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH] R600: Factorize code handling Const Read Port limitation
I fixed the coding style issue. The iostream include was a debug leftover line, it shouldn't be there. - Mail original - De : Tom Stellard t...@stellard.net À : Vincent Lejeune v...@ovi.com Cc : llvm-comm...@cs.uiuc.edu; mesa-dev@lists.freedesktop.org Envoyé le : Mercredi 13 mars 2013 21h49 Objet : Re: [PATCH] R600: Factorize code handling Const Read Port limitation On Wed, Mar 13, 2013 at 09:12:41PM +0100, Vincent Lejeune wrote: --- lib/Target/R600/AMDILISelDAGToDAG.cpp | 34 ++ lib/Target/R600/R600InstrInfo.cpp | 54 ++ lib/Target/R600/R600InstrInfo.h | 3 ++ lib/Target/R600/R600MachineScheduler.cpp | 77 lib/Target/R600/R600MachineScheduler.h | 3 +- test/CodeGen/R600/kcache-fold-2.ll | 52 + 6 files changed, 144 insertions(+), 79 deletions(-) create mode 100644 test/CodeGen/R600/kcache-fold-2.ll diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index 0c7880d..05a1ea7 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -336,6 +336,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return Result; } + Whitespace bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, const R600InstrInfo *TII, std::vectorSDValue Ops) { int OperandIdx[] = { @@ -365,17 +366,34 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, SDValue Operand = Ops[OperandIdx[i] - 1]; switch (Operand.getOpcode()) { case AMDGPUISD::CONST_ADDRESS: { - if (i == 2) - break; SDValue CstOffset; - if (!Operand.getValueType().isVector() - SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { - Ops[OperandIdx[i] - 1] = CurDAG-getRegister(AMDGPU::ALU_CONST, MVT::f32); - Ops[SelIdx[i] - 1] = CstOffset; - return true; + if (Operand.getValueType().isVector() || + !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) + break; + + // Gather others constants values + std::vectorunsigned Consts; + for (unsigned j = 0; j 3; j++) { + int SrcIdx = OperandIdx[j]; + if (SrcIdx 0) + break; + if 
(RegisterSDNode *Reg = dyn_castRegisterSDNode(Ops[SrcIdx - 1])) { + if (Reg-getReg() == AMDGPU::ALU_CONST) { + ConstantSDNode *Cst = dyn_castConstantSDNode(Ops[SelIdx[j] - 1]); + Consts.push_back(Cst-getZExtValue()); + } + } } + + ConstantSDNode *Cst = dyn_castConstantSDNode(CstOffset); + Consts.push_back(Cst-getZExtValue()); + if (!TII-fitsConstReadLimitations(Consts)) + break; + + Ops[OperandIdx[i] - 1] = CurDAG-getRegister(AMDGPU::ALU_CONST, MVT::f32); + Ops[SelIdx[i] - 1] = CstOffset; + return true; } - break; case ISD::FNEG: if (NegIdx[i] 0) break; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index be3318a..0865098 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -139,6 +139,60 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const { (TargetFlags R600_InstFlag::OP3)); } +bool +R600InstrInfo::fitsConstReadLimitations(const std::vectorunsigned Consts) + const { + assert (Consts.size() = 12 Too many operands in instructions group); + unsigned Pair1 = 0, Pair2 = 0; + for (unsigned i = 0, n = Consts.size(); i n; ++i) { + unsigned ReadConstHalf = Consts[i] 2; + unsigned ReadConstIndex = Consts[i] (~3); + unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; + if (!Pair1) { + Pair1 = ReadHalfConst; + continue; + } + if (Pair1 == ReadHalfConst) + continue; + if (!Pair2) { + Pair2 = ReadHalfConst; + continue; + } + if (Pair2 != ReadHalfConst) + return false; + } + return true; +} + +bool +R600InstrInfo::canBundle(const std::vectorMachineInstr * MIs) const { + std::vectorunsigned Consts; + for (unsigned i = 0, n = MIs.size(); i n; i++) { + const MachineInstr *MI = MIs[i]; + + const R600Operands::Ops OpTable[3][2] = { + {R600Operands::SRC0, R600Operands::SRC0_SEL}, + {R600Operands::SRC1, R600Operands::SRC1_SEL}, + {R600Operands::SRC2, R600Operands::SRC2_SEL}, + }; + + if (!isALUInstr(MI-getOpcode())) + continue; + + for (unsigned j = 0; j 3; j++) { + int SrcIdx = 
getOperandIdx(MI-getOpcode(), OpTable[j][0]); + if (SrcIdx 0) + break; + if (MI-getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) { + unsigned Const = MI-getOperand( + getOperandIdx(MI
[Mesa-dev] [PATCH] R600: Fix JUMP handling so that MachineInstr verification can occur
This allows R600 Target to use the newly created -verify-misched llc flag --- lib/Target/R600/AMDILCFGStructurizer.cpp| 8 +- lib/Target/R600/R600ISelLowering.cpp| 7 +- lib/Target/R600/R600InstrInfo.cpp | 66 ++-- lib/Target/R600/R600Instructions.td | 26 +++-- test/CodeGen/R600/schedule-fs-loop-nested-if.ll | 82 +++ test/CodeGen/R600/schedule-fs-loop-nested.ll| 87 test/CodeGen/R600/schedule-fs-loop.ll | 54 ++ test/CodeGen/R600/schedule-vs-if-nested-loop.ll | 133 8 files changed, 418 insertions(+), 45 deletions(-) create mode 100644 test/CodeGen/R600/schedule-fs-loop-nested-if.ll create mode 100644 test/CodeGen/R600/schedule-fs-loop-nested.ll create mode 100644 test/CodeGen/R600/schedule-fs-loop.ll create mode 100644 test/CodeGen/R600/schedule-vs-if-nested-loop.ll diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp index aa8ab6b..b0cd0f9 100644 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -2595,6 +2595,7 @@ struct CFGStructTraitsAMDGPUCFGStructurizer { static int getBranchNzeroOpcode(int oldOpcode) { switch(oldOpcode) { +case AMDGPU::JUMP_COND: case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; case AMDGPU::BRANCH_COND_i32: case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; @@ -2606,6 +2607,7 @@ struct CFGStructTraitsAMDGPUCFGStructurizer { static int getBranchZeroOpcode(int oldOpcode) { switch(oldOpcode) { +case AMDGPU::JUMP_COND: case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; case AMDGPU::BRANCH_COND_i32: case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; @@ -2617,6 +2619,7 @@ struct CFGStructTraitsAMDGPUCFGStructurizer { static int getContinueNzeroOpcode(int oldOpcode) { switch(oldOpcode) { +case AMDGPU::JUMP_COND: case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; default: assert(0 internal error); @@ -2626,6 +2629,7 @@ struct CFGStructTraitsAMDGPUCFGStructurizer { static int getContinueZeroOpcode(int oldOpcode) { switch(oldOpcode) { 
+case AMDGPU::JUMP_COND: case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; default: assert(0 internal error); @@ -2654,8 +2658,7 @@ struct CFGStructTraitsAMDGPUCFGStructurizer { static bool isCondBranch(MachineInstr *instr) { switch (instr-getOpcode()) { - case AMDGPU::JUMP: -return instr-getOperand(instr-findFirstPredOperandIdx()).getReg() != 0; + case AMDGPU::JUMP_COND: case AMDGPU::BRANCH_COND_i32: case AMDGPU::BRANCH_COND_f32: break; @@ -2668,7 +2671,6 @@ struct CFGStructTraitsAMDGPUCFGStructurizer { static bool isUncondBranch(MachineInstr *instr) { switch (instr-getOpcode()) { case AMDGPU::JUMP: - return instr-getOperand(instr-findFirstPredOperandIdx()).getReg() == 0; case AMDGPU::BRANCH: return true; default: diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 6ee4c8f..a73691d 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -221,8 +221,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( case AMDGPU::BRANCH: BuildMI(*BB, I, BB-findDebugLoc(I), TII-get(AMDGPU::JUMP)) - .addOperand(MI-getOperand(0)) - .addReg(0); + .addOperand(MI-getOperand(0)); break; case AMDGPU::BRANCH_COND_f32: { @@ -233,7 +232,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( .addImm(OPCODE_IS_NOT_ZERO) .addImm(0); // Flags TII-addFlag(NewMI, 0, MO_FLAG_PUSH); -BuildMI(*BB, I, BB-findDebugLoc(I), TII-get(AMDGPU::JUMP)) +BuildMI(*BB, I, BB-findDebugLoc(I), TII-get(AMDGPU::JUMP_COND)) .addOperand(MI-getOperand(0)) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; @@ -247,7 +246,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( .addImm(OPCODE_IS_NOT_ZERO_INT) .addImm(0); // Flags TII-addFlag(NewMI, 0, MO_FLAG_PUSH); -BuildMI(*BB, I, BB-findDebugLoc(I), TII-get(AMDGPU::JUMP)) +BuildMI(*BB, I, BB-findDebugLoc(I), TII-get(AMDGPU::JUMP_COND)) .addOperand(MI-getOperand(0)) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; 
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 106bbc0..be3318a 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -168,6 +168,11 @@ findFirstPredicateSetterFrom(MachineBasicBlock MBB, return NULL; } +static +bool isJump(unsigned Opcode) { + return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; +} + bool
Re: [Mesa-dev] [PATCH 7/7] radeon/llvm: enable LICM and DCE pass
Reviewed-by: Vincent Lejeune vljn at ovi.com - Mail original - De : Christian König deathsim...@vodafone.de À : mesa-dev@lists.freedesktop.org Cc : mic...@daenzer.net Envoyé le : Mardi 5 mars 2013 15h27 Objet : [Mesa-dev] [PATCH 7/7] radeon/llvm: enable LICM and DCE pass From: Christian König christian.koe...@amd.com Signed-off-by: Christian König christian.koe...@amd.com --- .../drivers/radeon/radeon_setup_tgsi_llvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index f7b7586..f017b87 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -1281,6 +1281,8 @@ void radeon_llvm_finalize_module(struct radeon_llvm_context * ctx) /* Add some optimization passes */ LLVMAddScalarReplAggregatesPass(gallivm-passmgr); + LLVMAddLICMPass(gallivm-passmgr); + LLVMAddAggressiveDCEPass(gallivm-passmgr); LLVMAddCFGSimplificationPass(gallivm-passmgr); /* Run the passs */ -- 1.7.9.5 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 7/7] radeon/llvm: enable LICM and DCE pass
LICM stands for Loop Invariant Code Motion. Instructions that do not depend on the loop index are moved outside of the loop body. (This solves one of the issues with LLVM-generated code that Vadim pointed out in another thread.) DCE is Dead Code Elimination... I don't know the difference between classic DCE and aggressive DCE, though. - Mail original - De : Michel Dänzer mic...@daenzer.net À : Christian König deathsim...@vodafone.de Cc : mesa-dev@lists.freedesktop.org Envoyé le : Mardi 5 mars 2013 18h20 Objet : Re: [Mesa-dev] [PATCH 7/7] radeon/llvm: enable LICM and DCE pass On Die, 2013-03-05 at 15:27 +0100, Christian König wrote: From: Christian König christian.koe...@amd.com Signed-off-by: Christian König christian.koe...@amd.com This could use a little more information, e.g.: What are LICM and DCE? Why is it a good idea to enable them? -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Debian, X and DRI developer ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/6] R600: Use MUL_IEEE for trig/fdiv intrinsic
--- lib/Target/R600/R600Instructions.td | 8 test/CodeGen/R600/fdiv.v4f32.ll | 8 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 0a01400..e4cc06e 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1090,12 +1090,12 @@ class COS_Common bits11 inst : R600_1OP multiclass DIV_Common InstR600 recip_ieee { def : Pat (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1), - (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) + (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) ; def : Pat (fdiv R600_Reg32:$src0, R600_Reg32:$src1), - (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) + (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) ; } @@ -1169,12 +1169,12 @@ let Predicates = [isR600] in { // cards. class COS_PAT InstR600 trig : Pat (fcos R600_Reg32:$src), - (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) + (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) ; class SIN_PAT InstR600 trig : Pat (fsin R600_Reg32:$src), - (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) + (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) ; //===--===// diff --git a/test/CodeGen/R600/fdiv.v4f32.ll b/test/CodeGen/R600/fdiv.v4f32.ll index b013fd6..459fd11 100644 --- a/test/CodeGen/R600/fdiv.v4f32.ll +++ b/test/CodeGen/R600/fdiv.v4f32.ll @@ -1,13 +1,13 @@ ;RUN: llc %s -march=r600 -mcpu=redwood | FileCheck %s ;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: MUL NON-IEEE 
T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} define void @test(4 x float addrspace(1)* %out, 4 x float addrspace(1)* %in) { %b_ptr = getelementptr 4 x float addrspace(1)* %in, i32 1 -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/6] R600: CONST_ADDRESS node is not marked as mayLoad anymore
mayLoad complicates scheduling and does not bring any useful info, as the location is not writeable at all. --- lib/Target/R600/R600Instructions.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index e4cc06e..0a777f1 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -513,7 +513,7 @@ def INTERP_PAIR_ZW : AMDGPUShaderInst def CONST_ADDRESS: SDNodeAMDGPUISD::CONST_ADDRESS, SDTypeProfile1, -1, [SDTCisInt0, SDTCisPtrTy1], - [SDNPMayLoad, SDNPVariadic] + [SDNPVariadic] ; //===--===// -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/6] R600: Turn BUILD_VECTOR into Reg_Sequence
--- lib/Target/R600/AMDILISelDAGToDAG.cpp | 29 + 1 file changed, 29 insertions(+) diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index 2e726e9..6b24117 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -160,6 +160,35 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } switch (Opc) { default: break; + case ISD::BUILD_VECTOR: { +const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); +if (ST.device()-getGeneration() AMDGPUDeviceInfo::HD6XXX) { + break; +} +// BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG +// that adds a 128 bits reg copy when going through TwoAddressInstructions +// pass. We want to avoid 128 bits copies as much as possible because they +// can't be bundled by our scheduler. +SDValue RegSeqArgs[9] = { + CurDAG-getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32), + SDValue(), CurDAG-getTargetConstant(AMDGPU::sub0, MVT::i32), + SDValue(), CurDAG-getTargetConstant(AMDGPU::sub1, MVT::i32), + SDValue(), CurDAG-getTargetConstant(AMDGPU::sub2, MVT::i32), + SDValue(), CurDAG-getTargetConstant(AMDGPU::sub3, MVT::i32) +}; +bool IsRegSeq = true; +for (unsigned i = 0; i N-getNumOperands(); i++) { + if (dyn_castRegisterSDNode(N-getOperand(i))) { +IsRegSeq = false; +break; + } + RegSeqArgs[2 * i + 1] = N-getOperand(i); +} +if (!IsRegSeq) + break; +return CurDAG-SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N-getVTList(), +RegSeqArgs, 2 * N-getNumOperands() + 1); + } case ISD::ConstantFP: case ISD::Constant: { const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 4/6] R600: Fix for Unigine when MachineSched is enabled
--- lib/Target/R600/R600Instructions.td | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 0a777f1..74106c9 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1587,6 +1587,7 @@ def PRED_X : InstR600 (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), , [], NullALU { let FlagOperandIdx = 3; + let isTerminator = 1; } let isTerminator = 1, isBranch = 1, isBarrier = 1 in { -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 5/6] R600: Remove LowerConstCopyPass and lower CONST_COPY right after ISel.
Maintaining CONST_COPY Instructions until Pre Emit may prevent some ifcvt case and taking them in account for scheduling is difficult for no real benefit. --- lib/Target/R600/AMDGPU.h| 1 - lib/Target/R600/AMDGPUTargetMachine.cpp | 1 - lib/Target/R600/R600ISelLowering.cpp| 8 +- lib/Target/R600/R600Instructions.td | 7 +- lib/Target/R600/R600LowerConstCopy.cpp | 222 5 files changed, 11 insertions(+), 228 deletions(-) delete mode 100644 lib/Target/R600/R600LowerConstCopy.cpp diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index ba87918..67073ab 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -23,7 +23,6 @@ class AMDGPUTargetMachine; // R600 Passes FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine tm); -FunctionPass *createR600LowerConstCopy(TargetMachine tm); // SI Passes FunctionPass *createSIAnnotateControlFlowPass(); diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index e2f00be..70b34b0 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -143,7 +143,6 @@ bool AMDGPUPassConfig::addPreEmitPass() { addPass(createAMDGPUCFGStructurizerPass(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); addPass(FinalizeMachineBundlesID); -addPass(createR600LowerConstCopy(*TM)); } else { addPass(createSILowerControlFlowPass(*TM)); } diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index ece0b9a..f25ced1 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -150,7 +150,13 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( TII-buildMovImm(*BB, I, MI-getOperand(0).getReg(), MI-getOperand(1).getImm()); break; - + case AMDGPU::CONST_COPY: { +MachineInstr *NewMI = TII-buildDefaultInstruction(*BB, MI, AMDGPU::MOV, +MI-getOperand(0).getReg(), AMDGPU::ALU_CONST); 
+TII-setImmOperand(NewMI, R600Operands::SRC0_SEL, +MI-getOperand(1).getImm()); +break; + } case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 74106c9..10bcdcf 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1650,17 +1650,18 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, // Constant Buffer Addressing Support //===--===// -let isCodeGenOnly = 1, isPseudo = 1, Namespace = AMDGPU in { +let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = AMDGPU in { def CONST_COPY : Instruction { let OutOperandList = (outs R600_Reg32:$dst); let InOperandList = (ins i32imm:$src); - let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; + let Pattern = + [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; let AsmString = CONST_COPY; let neverHasSideEffects = 1; let isAsCheapAsAMove = 1; let Itinerary = NullALU; } -} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = AMDGPU +} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = AMDGPU def TEX_VTX_CONSTBUF : InstR600ISA (outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), VTX_READ_eg $dst, $ptr, diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp deleted file mode 100644 index 3ebe653..000 --- a/lib/Target/R600/R600LowerConstCopy.cpp +++ /dev/null @@ -1,222 +0,0 @@ -//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===--===// -// -/// \file -/// This pass is intended to handle remaining ConstCopy pseudo MachineInstr. -/// ISel will fold each Const Buffer read inside scalar ALU. 
However it cannot -/// fold them inside vector instruction, like DOT4 or Cube ; ISel emits -/// ConstCopy instead. This pass (executed after ExpandingSpecialInstr) will try -/// to fold them if possible or replace them by MOV otherwise. -// -//===--===// - -#include AMDGPU.h -#include R600InstrInfo.h -#include llvm/CodeGen/MachineFunction.h -#include llvm/CodeGen/MachineFunctionPass.h -#include llvm/CodeGen/MachineInstrBuilder.h -#include llvm/IR/GlobalValue.h - -namespace llvm { - -class R600LowerConstCopy : public MachineFunctionPass { -private: - static char ID; - const
[Mesa-dev] [PATCH 6/6] R600: initial scheduler code
From: Vadim Girlin vadimgir...@gmail.com This is a skeleton for a pre-RA MachineInstr scheduler strategy. Currently it only tries to expose more parallelism for ALU instructions (this also makes the distribution of GPR channels more uniform and increases the chances of ALU instructions to be packed together in a single VLIW group). Also it tries to reduce clause switching by grouping instruction of the same kind (ALU/FETCH/CF) together. Vincent Lejeune: - Support for VLIW4 Slot assignement - Recomputation of ScheduleDAG to get more parallelism opportunities Tom Stellard: - Fix assertion failure when trying to determine an instruction's slot based on its destination register's class - Fix some compiler warnings Vincent Lejeune: [v2] - Remove recomputation of ScheduleDAG (will be provided in a later patch) - Improve estimation of an ALU clause size so that heuristic does not emit cf instructions at the wrong position. - Make schedule heuristic smarter using SUnit Depth - Take constant read limitations into account --- lib/Target/R600/AMDGPUTargetMachine.cpp | 17 +- lib/Target/R600/R600MachineScheduler.cpp | 483 +++ lib/Target/R600/R600MachineScheduler.h | 121 test/CodeGen/R600/fdiv.v4f32.ll | 6 +- 4 files changed, 623 insertions(+), 4 deletions(-) create mode 100644 lib/Target/R600/R600MachineScheduler.cpp create mode 100644 lib/Target/R600/R600MachineScheduler.h diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 70b34b0..eb58853 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -17,6 +17,7 @@ #include AMDGPU.h #include R600ISelLowering.h #include R600InstrInfo.h +#include R600MachineScheduler.h #include SIISelLowering.h #include SIInstrInfo.h #include llvm/Analysis/Passes.h @@ -39,6 +40,14 @@ extern C void LLVMInitializeR600Target() { RegisterTargetMachineAMDGPUTargetMachine X(TheAMDGPUTarget); } +static ScheduleDAGInstrs 
*createR600MachineScheduler(MachineSchedContext *C) { + return new ScheduleDAGMI(C, new R600SchedStrategy()); +} + +static MachineSchedRegistry +SchedCustomRegistry(r600, Run R600's custom scheduler, +createR600MachineScheduler); + AMDGPUTargetMachine::AMDGPUTargetMachine(const Target T, StringRef TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -70,7 +79,13 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase PM) -: TargetPassConfig(TM, PM) {} +: TargetPassConfig(TM, PM) { +const AMDGPUSubtarget ST = TM-getSubtargetAMDGPUSubtarget(); +if (ST.device()-getGeneration() = AMDGPUDeviceInfo::HD6XXX) { + enablePass(MachineSchedulerID); + MachineSchedRegistry::setDefault(createR600MachineScheduler); +} + } AMDGPUTargetMachine getAMDGPUTargetMachine() const { return getTMAMDGPUTargetMachine(); diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp new file mode 100644 index 000..efd9490 --- /dev/null +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -0,0 +1,483 @@ +//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===--===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot +// +//===--===// + +#define DEBUG_TYPE misched + +#include R600MachineScheduler.h +#include llvm/CodeGen/MachineRegisterInfo.h +#include llvm/CodeGen/LiveIntervalAnalysis.h +#include llvm/Pass.h +#include llvm/PassManager.h +#include set +#include iostream +using namespace llvm; + +void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { + + DAG = dag; + TII = static_castconst R600InstrInfo*(DAG-TII); + TRI = static_castconst R600RegisterInfo*(DAG-TRI); + MRI = DAG-MRI; + Available[IDAlu]-clear(); + Available[IDFetch]-clear(); + Available[IDOther]-clear(); + CurInstKind = IDOther; + CurEmitted = 0; + memset(InstructionsGroupCandidate, 0, sizeof(InstructionsGroupCandidate)); + InstKindLimit[IDAlu] = 120; // 120 minus 8 for security + + + const AMDGPUSubtarget ST = DAG-TM.getSubtargetAMDGPUSubtarget(); + if (ST.device()-getGeneration() = AMDGPUDeviceInfo::HD5XXX) { +InstKindLimit[IDFetch] = 7; // 8 minus 1 for security + } else { +InstKindLimit[IDFetch] = 15; // 16 minus 1 for security + } +} + +void R600SchedStrategy::MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst) +{ + if (QSrc-empty()) +return; + for (ReadyQueue::iterator I = QSrc-begin(), + E = QSrc-end
[Mesa-dev] Re : Re: r600g: status of my work on the shader optimization
I think the poor LLVM result can be explained by the current lack of muladd support. Unigine 3.0 has a lot of geometry, and I suspect that the vertex shaders being almost twice as large as they are in the TGSI case does not help. FWIW, with an HD 6950 I get the same performance in Unigine 3 at high quality, medium textures, no SSAO (it seems to use indirect addressing) with the LLVM backend as at high quality, high textures, no SSAO with fglrx under Windows. It's not a fair comparison, but I think the 3.8 kernel may provide the necessary boost to catch up with fglrx. Anyway, I see some FPS peaks at 60 fps that do not show up with the TGSI backend but that I also see with fglrx, which makes me think the LLVM backend generates rather efficient code; note, however, that I always cherry-pick the muladd patches. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] R600: Increase number of ArrayBase Reg to 32
--- lib/Target/R600/R600RegisterInfo.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index 3812eb7..0718854 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -44,7 +44,7 @@ foreach Index = 0-127 in { } // Array Base Register holding input in FS -foreach Index = 448-464 in { +foreach Index = 448-480 in { def ArrayBase#Index : R600RegARRAY_BASE, Index; } @@ -66,7 +66,7 @@ def PRED_SEL_ONE : R600RegPred_sel_one, 3; def AR_X : R600RegAR.x, 0; def R600_ArrayBase : RegisterClass AMDGPU, [f32, i32], 32, - (add (sequence ArrayBase%u, 448, 464)); + (add (sequence ArrayBase%u, 448, 480)); // special registers for ALU src operands // const buffer reference, SRCx_SEL contains index def ALU_CONST : R600RegCBuf, 0; -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] R600: Support for TBO
NOTE: This is a candidate for the Mesa stable branch. --- lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 3 +- lib/Target/R600/R600Instructions.td| 54 ++ lib/Target/R600/R600Intrinsics.td | 2 + 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index e061b18..7ec783f 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -165,7 +165,8 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, case AMDGPU::VTX_READ_GLOBAL_8_eg: case AMDGPU::VTX_READ_GLOBAL_32_eg: case AMDGPU::VTX_READ_GLOBAL_128_eg: -case AMDGPU::TEX_VTX_CONSTBUF: { +case AMDGPU::TEX_VTX_CONSTBUF: +case AMDGPU::TEX_VTX_TEXBUF : { uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 529a4ed..e7efd0b 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1710,6 +1710,60 @@ def TEX_VTX_CONSTBUF : // Inst{127-96} = 0; } +def TEX_VTX_TEXBUF: + InstR600ISA (outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), TEX_VTX_EXPLICIT_READ $dst, $ptr, + [(set R600_Reg128:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))], +VTX_WORD1_GPR, VTX_WORD0 { + +let VC_INST = 0; +let FETCH_TYPE = 2; +let FETCH_WHOLE_QUAD = 0; +let SRC_REL = 0; +let SRC_SEL_X = 0; +let DST_REL = 0; +let USE_CONST_FIELDS = 1; +let NUM_FORMAT_ALL = 0; +let FORMAT_COMP_ALL = 0; +let SRF_MODE_ALL = 1; +let MEGA_FETCH_COUNT = 16; +let DST_SEL_X= 0; +let DST_SEL_Y= 1; +let DST_SEL_Z= 2; +let DST_SEL_W= 3; +let DATA_FORMAT = 0; + +let Inst{31-0} = Word0; +let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits16 OFFSET; +// bits2 
ENDIAN_SWAP = 0; +// bits1 CONST_BUF_NO_STRIDE = 0; +// bits1 MEGA_FETCH = 0; +// bits1 ALT_CONST = 0; +// bits2 BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82}= CONST_BUF_NO_STRIDE; +// Inst{83}= MEGA_FETCH; +// Inst{84}= ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; +} + + //======// // Instructions support diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td index b5e4f1e..dc8980a 100644 --- a/lib/Target/R600/R600Intrinsics.td +++ b/lib/Target/R600/R600Intrinsics.td @@ -16,6 +16,8 @@ let TargetPrefix = R600, isTarget = 1 in { Intrinsic[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]; def int_R600_interp_input : Intrinsic[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]; + def int_R600_load_texbuf : +Intrinsic[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]; def int_R600_store_swizzle : Intrinsic[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []; def int_R600_store_stream_output : -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/4] r600g/llvm: Add support for UBO
NOTE: This is a candidate for the Mesa stable branch. --- src/gallium/drivers/r600/r600_llvm.c| 6 +- src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 17 + 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index fa66fcc..7a41688 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -38,8 +38,12 @@ static LLVMValueRef llvm_fetch_const( LLVMValueRef index = LLVMBuildLoad(bld_base-base.gallivm-builder, bld-addr[reg-Indirect.Index][reg-Indirect.SwizzleX], ); offset[1] = LLVMBuildAdd(bld_base-base.gallivm-builder, offset[1], index, ); } + unsigned ConstantAddressSpace = CONSTANT_BUFFER_0_ADDR_SPACE ; + if (reg-Register.Dimension) { + ConstantAddressSpace += reg-Dimension.Index; + } LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base-base.elem_type, 4), 1024), - CONSTANT_BUFFER_0_ADDR_SPACE); + ConstantAddressSpace); LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base-base.gallivm-builder, lp_build_const_int32(bld_base-base.gallivm, 0), const_ptr_type, ); LLVMValueRef ptr = LLVMBuildGEP(bld_base-base.gallivm-builder, const_ptr, offset, 2, ); LLVMValueRef cvecval = LLVMBuildLoad(bld_base-base.gallivm-builder, ptr, ); diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 0f90991..8902ae4 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -766,6 +766,22 @@ static void emit_icmp( emit_data-output[emit_data-chan] = v; } +static void emit_ucmp( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + unsigned pred; + LLVMBuilderRef builder = bld_base-base.gallivm-builder; + LLVMContextRef context = bld_base-base.gallivm-context; + + + LLVMValueRef v = LLVMBuildFCmp(builder, LLVMRealUGE, + 
emit_data-args[0], lp_build_const_float(bld_base-base.gallivm, 0.), ); + + emit_data-output[emit_data-chan] = LLVMBuildSelect(builder, v, emit_data-args[2], emit_data-args[1], ); +} + static void emit_cmp( const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context * bld_base, @@ -1241,6 +1257,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base-op_actions[TGSI_OPCODE_USNE].emit = emit_icmp; bld_base-op_actions[TGSI_OPCODE_U2F].emit = emit_u2f; bld_base-op_actions[TGSI_OPCODE_XOR].emit = emit_xor; + bld_base-op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp; bld_base-rsq_action.emit = build_tgsi_intrinsic_nomem; bld_base-rsq_action.intr_name = llvm.AMDGPU.rsq; -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/4] r600g/llvm: Fix alpha_to_one piglit tests
--- src/gallium/drivers/r600/r600_llvm.c | 2 ++ src/gallium/drivers/r600/r600_shader.c | 1 + src/gallium/drivers/radeon/radeon_llvm.h | 1 + 3 files changed, 4 insertions(+) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 7a41688..59047e7 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -234,6 +234,8 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) elements[chan] = LLVMBuildLoad(base-gallivm-builder, ctx-soa.outputs[i][chan], ); } + if (ctx-alpha_to_one ctx-type == TGSI_PROCESSOR_FRAGMENT ctx-r600_outputs[i].name == TGSI_SEMANTIC_COLOR) + elements[3] = lp_build_const_float(base-gallivm, 1.0f); LLVMValueRef output = lp_build_gather_values(base-gallivm, elements, 4); if (ctx-type == TGSI_PROCESSOR_VERTEX) { diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 59a7f92..8642463 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -1428,6 +1428,7 @@ static int r600_shader_from_tgsi(struct r600_screen *rscreen, radeon_llvm_ctx.fs_color_all = shader-fs_write_all (rscreen-chip_class = EVERGREEN); radeon_llvm_ctx.stream_outputs = so; radeon_llvm_ctx.clip_vertex = ctx.cv_output; + radeon_llvm_ctx.alpha_to_one = key.alpha_to_one; mod = r600_tgsi_llvm(radeon_llvm_ctx, tokens); if (debug_get_bool_option(R600_DUMP_SHADERS, FALSE)) { dump = 1; diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index 21360e2..bfeacb5 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -64,6 +64,7 @@ struct radeon_llvm_context { struct pipe_stream_output_info *stream_outputs; unsigned color_buffer_count; unsigned fs_color_all; + unsigned alpha_to_one; /*=== Front end configuration ===*/ -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org 
http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/4] r600g/llvm: Set Inputs/Outputs count to 32 (api reported value)
--- src/gallium/drivers/radeon/radeon_llvm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index bfeacb5..b1e025b 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -31,8 +31,8 @@ #include gallivm/lp_bld_init.h #include gallivm/lp_bld_tgsi.h -#define RADEON_LLVM_MAX_INPUTS 16 * 4 -#define RADEON_LLVM_MAX_OUTPUTS 16 * 4 +#define RADEON_LLVM_MAX_INPUTS 32 * 4 +#define RADEON_LLVM_MAX_OUTPUTS 32 * 4 #define RADEON_LLVM_MAX_BRANCH_DEPTH 16 #define RADEON_LLVM_MAX_LOOP_DEPTH 16 -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 4/4] r600g/llvm: Support for TBO
--- src/gallium/drivers/r600/r600_llvm.c | 29 + 1 file changed, 29 insertions(+) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 59047e7..89bcb79 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -412,6 +412,35 @@ static void llvm_emit_tex( LLVMValueRef args[6]; unsigned c, sampler_src; + if (emit_data-inst-Texture.Texture == TGSI_TEXTURE_BUFFER) { + switch (emit_data-inst-Instruction.Opcode) { + case TGSI_OPCODE_TXQ: { + LLVMValueRef offset[2] = { + LLVMConstInt(LLVMInt64TypeInContext(bld_base-base.gallivm-context), 0, false), + lp_build_const_int32(bld_base-base.gallivm, 1) + }; + LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base-base.elem_type, 4), 1024), + R600_BUFFER_INFO_CONST_BUFFER); + LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base-base.gallivm-builder, lp_build_const_int32(bld_base-base.gallivm, 0), const_ptr_type, ); + LLVMValueRef ptr = LLVMBuildGEP(bld_base-base.gallivm-builder, const_ptr, offset, 2, ); + LLVMValueRef cvecval = LLVMBuildLoad(bld_base-base.gallivm-builder, ptr, ); + emit_data-output[0] = cvecval; + break; + } + case TGSI_OPCODE_TXF: { + args[0] = LLVMBuildExtractElement(gallivm-builder, emit_data-args[0], lp_build_const_int32(gallivm, 0), ); + args[1] = lp_build_const_int32(gallivm, R600_MAX_CONST_BUFFERS); + emit_data-output[0] = build_intrinsic(gallivm-builder, + llvm.R600.load.texbuf, + emit_data-dst_type, args, 2, LLVMReadNoneAttribute); + } + break; + default: + assert(0 Unknow Texture Buffer Instruction !); + } + return; + } + assert(emit_data-arg_count + 2 = Elements(args)); for (c = 0; c emit_data-arg_count; ++c) -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600g: Report Instructions Group count with R600_DUMP_SHADERS=1
--- src/gallium/drivers/r600/r600_asm.c | 7 ++- src/gallium/drivers/r600/r600_asm.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index 3632aa5..eacdb0c 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -812,6 +812,8 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu /* looks like everything worked out right, apply the changes */ + bc-nig --; + /* undo adding previus literals */ bc-cf_last-ndw -= align(prev_nliteral, 2); @@ -1140,6 +1142,9 @@ int r600_bytecode_add_alu_type(struct r600_bytecode *bc, if (nalu-dst.sel = bc-ngpr) { bc-ngpr = nalu-dst.sel + 1; } + if (nalu-last) { + bc-nig ++; + } LIST_ADDTAIL(nalu-list, bc-cf_last-alu); /* each alu use 2 dwords */ bc-cf_last-ndw += 2; @@ -2105,7 +2110,7 @@ void r600_bytecode_dump(struct r600_bytecode *bc) chip = '6'; break; } - fprintf(stderr, bytecode %d dw -- %d gprs -\n, bc-ndw, bc-ngpr); + fprintf(stderr, bytecode %d dw -- %d gprs -- %d ig-\n, bc-ndw, bc-ngpr, bc-nig); fprintf(stderr, %c\n, chip); LIST_FOR_EACH_ENTRY(cf, bc-cf, list) { diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index 03cd238..1638ca0 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -192,6 +192,7 @@ struct r600_bytecode { struct r600_bytecode_cf *cf_last; unsignedndw; unsignedncf; + unsignednig; // Number of Instructions Group unsignedngpr; unsignednstack; unsignednresource; -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600/SI: Do not fold single instruction with more than 3 kcache reads
It fixes around 100 tfb piglit tests and 16 glean tests. NOTE: This is a candidate for the Mesa stable branch. --- lib/Target/R600/AMDILISelDAGToDAG.cpp | 2 ++ lib/Target/R600/R600LowerConstCopy.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index 2f34fe3..858eb5d 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -358,6 +358,8 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, SDValue Operand = Ops[OperandIdx[i] - 1]; switch (Operand.getOpcode()) { case AMDGPUISD::CONST_ADDRESS: { + if (i == 2) +break; SDValue CstOffset; if (!Operand.getValueType().isVector() SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp index 2557e8f..c8c27a8 100644 --- a/lib/Target/R600/R600LowerConstCopy.cpp +++ b/lib/Target/R600/R600LowerConstCopy.cpp @@ -180,7 +180,7 @@ bool R600LowerConstCopy::runOnMachineFunction(MachineFunction MF) { int ConstMovSel = TII-getOperandIdx(CstMov-getOpcode(), R600Operands::SRC0_SEL); unsigned ConstIndex = CstMov-getOperand(ConstMovSel).getImm(); -if (canFoldInBundle(CP, ConstIndex)) { +if (MI-isInsideBundle() canFoldInBundle(CP, ConstIndex)) { TII-setImmOperand(MI, OpTable[SrcOp][1], ConstIndex); MI-getOperand(SrcIdx).setReg(AMDGPU::ALU_CONST); } else { -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600/SI: Add support for indirect addressing of non default const buffer
NOTE: This is a candidate for the Mesa stable branch. --- lib/Target/R600/R600ISelLowering.cpp | 6 -- lib/Target/R600/R600Instructions.td | 9 - 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 21d301c..c4cb870 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -911,7 +911,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG DAG) const if (ConstantBlock -1) { SDValue Result; if (dyn_castConstantExpr(LoadNode-getSrcValue()) || -dyn_castConstant(LoadNode-getSrcValue())) { +dyn_castConstant(LoadNode-getSrcValue()) || +dyn_castConstantSDNode(Ptr)) { SDValue Slots[4]; for (unsigned i = 0; i 4; i++) { // We want Const position encoded with the following formula : @@ -927,7 +928,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG DAG) const } else { // non constant ptr cant be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, - DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)) + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)), + DAG.getConstant(LoadNode-getAddressSpace() - 9, MVT::i32) ); } diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 50ff6aa..529a4ed 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -512,8 +512,8 @@ def INTERP_PAIR_ZW : AMDGPUShaderInst []; def CONST_ADDRESS: SDNodeAMDGPUISD::CONST_ADDRESS, - SDTypeProfile1, 1, [SDTCisInt0, SDTCisPtrTy1], - [SDNPMayLoad] + SDTypeProfile1, -1, [SDTCisInt0, SDTCisPtrTy1], + [SDNPMayLoad, SDNPVariadic] ; //===--===// @@ -1658,14 +1658,13 @@ def CONST_COPY : Instruction { } // end isCodeGenOnly = 1, isPseudo = 1, Namespace = AMDGPU def TEX_VTX_CONSTBUF : - InstR600ISA (outs R600_Reg128:$dst), (ins MEMxi:$ptr), VTX_READ_eg $dst, $ptr, - [(set R600_Reg128:$dst, (CONST_ADDRESS 
ADDRGA_VAR_OFFSET:$ptr))], + InstR600ISA (outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), VTX_READ_eg $dst, $ptr, + [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))], VTX_WORD1_GPR, VTX_WORD0 { let VC_INST = 0; let FETCH_TYPE = 2; let FETCH_WHOLE_QUAD = 0; - let BUFFER_ID = 0; let SRC_REL = 0; let SRC_SEL_X = 0; let DST_REL = 0; -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600g/llvm: Add support for UBO
NOTE: This is a candidate for the Mesa stable branch. --- src/gallium/drivers/r600/r600_llvm.c| 10 +++--- src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c | 17 + src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 4 ++-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index fa66fcc..e8b4679 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -18,7 +18,7 @@ #include stdio.h -#if defined R600_USE_LLVM || defined HAVE_OPENCL +//#if defined R600_USE_LLVM || defined HAVE_OPENCL #define CONSTANT_BUFFER_0_ADDR_SPACE 9 #define CONSTANT_BUFFER_1_ADDR_SPACE (CONSTANT_BUFFER_0_ADDR_SPACE + R600_UCP_CONST_BUFFER) @@ -38,8 +38,12 @@ static LLVMValueRef llvm_fetch_const( LLVMValueRef index = LLVMBuildLoad(bld_base-base.gallivm-builder, bld-addr[reg-Indirect.Index][reg-Indirect.SwizzleX], ); offset[1] = LLVMBuildAdd(bld_base-base.gallivm-builder, offset[1], index, ); } + unsigned ConstantAddressSpace = CONSTANT_BUFFER_0_ADDR_SPACE ; + if (reg-Register.Dimension) { + ConstantAddressSpace += reg-Dimension.Index; + } LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base-base.elem_type, 4), 1024), - CONSTANT_BUFFER_0_ADDR_SPACE); + ConstantAddressSpace); LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base-base.gallivm-builder, lp_build_const_int32(bld_base-base.gallivm, 0), const_ptr_type, ); LLVMValueRef ptr = LLVMBuildGEP(bld_base-base.gallivm-builder, const_ptr, offset, 2, ); LLVMValueRef cvecval = LLVMBuildLoad(bld_base-base.gallivm-builder, ptr, ); @@ -602,4 +606,4 @@ unsigned r600_llvm_compile( gpu_family, dump); } -#endif +//#endif diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 0f90991..8902ae4 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -766,6 +766,22 @@ 
static void emit_icmp( emit_data-output[emit_data-chan] = v; } +static void emit_ucmp( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + unsigned pred; + LLVMBuilderRef builder = bld_base-base.gallivm-builder; + LLVMContextRef context = bld_base-base.gallivm-context; + + + LLVMValueRef v = LLVMBuildFCmp(builder, LLVMRealUGE, + emit_data-args[0], lp_build_const_float(bld_base-base.gallivm, 0.), ); + + emit_data-output[emit_data-chan] = LLVMBuildSelect(builder, v, emit_data-args[2], emit_data-args[1], ); +} + static void emit_cmp( const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context * bld_base, @@ -1241,6 +1257,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx) bld_base-op_actions[TGSI_OPCODE_USNE].emit = emit_icmp; bld_base-op_actions[TGSI_OPCODE_U2F].emit = emit_u2f; bld_base-op_actions[TGSI_OPCODE_XOR].emit = emit_xor; + bld_base-op_actions[TGSI_OPCODE_UCMP].emit = emit_ucmp; bld_base-rsq_action.emit = build_tgsi_intrinsic_nomem; bld_base-rsq_action.intr_name = llvm.AMDGPU.rsq; diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c index 62ba4b1..bbfe664 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c @@ -372,7 +372,7 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws) } ws-info.r600_virtual_address = FALSE; -if (ws-info.drm_minor = 13) { +/*if (ws-info.drm_minor = 13) { ws-info.r600_virtual_address = TRUE; if (!radeon_get_drm_value(ws-fd, RADEON_INFO_VA_START, NULL, ws-info.r600_va_start)) @@ -380,7 +380,7 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws) if (!radeon_get_drm_value(ws-fd, RADEON_INFO_IB_VM_MAX_SIZE, NULL, ws-info.r600_ib_vm_max_size)) ws-info.r600_virtual_address = FALSE; -} +}*/ } /* Get max pipes, this is only needed for compute shaders. 
All evergreen+ -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600: Do not fold single instruction with more than 3 kcache reads
It fixes around 100 tfb piglit tests and 16 glean tests. NOTE: This is a candidate for the Mesa stable branch. --- lib/Target/R600/AMDILISelDAGToDAG.cpp | 2 ++ lib/Target/R600/R600LowerConstCopy.cpp | 2 +- test/CodeGen/R600/kcache-fold.ll | 52 ++ 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/R600/kcache-fold.ll diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index 2f34fe3..858eb5d 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -358,6 +358,8 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, SDValue Operand = Ops[OperandIdx[i] - 1]; switch (Operand.getOpcode()) { case AMDGPUISD::CONST_ADDRESS: { + if (i == 2) +break; SDValue CstOffset; if (!Operand.getValueType().isVector() SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp index 2557e8f..c8c27a8 100644 --- a/lib/Target/R600/R600LowerConstCopy.cpp +++ b/lib/Target/R600/R600LowerConstCopy.cpp @@ -180,7 +180,7 @@ bool R600LowerConstCopy::runOnMachineFunction(MachineFunction MF) { int ConstMovSel = TII-getOperandIdx(CstMov-getOpcode(), R600Operands::SRC0_SEL); unsigned ConstIndex = CstMov-getOperand(ConstMovSel).getImm(); -if (canFoldInBundle(CP, ConstIndex)) { +if (MI-isInsideBundle() canFoldInBundle(CP, ConstIndex)) { TII-setImmOperand(MI, OpTable[SrcOp][1], ConstIndex); MI-getOperand(SrcIdx).setReg(AMDGPU::ALU_CONST); } else { diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll new file mode 100644 index 000..382f78c --- /dev/null +++ b/test/CodeGen/R600/kcache-fold.ll @@ -0,0 +1,52 @@ +;RUN: llc %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: MOV T{{[0-9]+\.[XYZW], CBuf0\[[0-9]+\]\.[XYZW]}} + +define void @main() { +main_body: + %0 = load 4 x float addrspace(9)* null + %1 = extractelement 4 x float %0, i32 0 + %2 = load 4 x float 
addrspace(9)* getelementptr ([1024 x 4 x float] addrspace(9)* null, i64 0, i32 1) + %3 = extractelement 4 x float %2, i32 0 + %4 = load 4 x float addrspace(9)* getelementptr ([1024 x 4 x float] addrspace(9)* null, i64 0, i32 2) + %5 = extractelement 4 x float %4, i32 0 + %6 = fcmp ult float %1, 0.00e+00 + %7 = select i1 %6, float %3, float %5 + %8 = load 4 x float addrspace(9)* null + %9 = extractelement 4 x float %8, i32 1 + %10 = load 4 x float addrspace(9)* getelementptr ([1024 x 4 x float] addrspace(9)* null, i64 0, i32 1) + %11 = extractelement 4 x float %10, i32 1 + %12 = load 4 x float addrspace(9)* getelementptr ([1024 x 4 x float] addrspace(9)* null, i64 0, i32 2) + %13 = extractelement 4 x float %12, i32 1 + %14 = fcmp ult float %9, 0.00e+00 + %15 = select i1 %14, float %11, float %13 + %16 = load 4 x float addrspace(9)* null + %17 = extractelement 4 x float %16, i32 2 + %18 = load 4 x float addrspace(9)* getelementptr ([1024 x 4 x float] addrspace(9)* null, i64 0, i32 1) + %19 = extractelement 4 x float %18, i32 2 + %20 = load 4 x float addrspace(9)* getelementptr ([1024 x 4 x float] addrspace(9)* null, i64 0, i32 2) + %21 = extractelement 4 x float %20, i32 2 + %22 = fcmp ult float %17, 0.00e+00 + %23 = select i1 %22, float %19, float %21 + %24 = load 4 x float addrspace(9)* null + %25 = extractelement 4 x float %24, i32 3 + %26 = load 4 x float addrspace(9)* getelementptr ([1024 x 4 x float] addrspace(9)* null, i64 0, i32 1) + %27 = extractelement 4 x float %26, i32 3 + %28 = load 4 x float addrspace(9)* getelementptr ([1024 x 4 x float] addrspace(9)* null, i64 0, i32 2) + %29 = extractelement 4 x float %28, i32 3 + %30 = fcmp ult float %25, 0.00e+00 + %31 = select i1 %30, float %27, float %29 + %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.00e+00, float 1.00e+00) + %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.00e+00, float 1.00e+00) + %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.00e+00, float 1.00e+00) + %35 = call float 
@llvm.AMDIL.clamp.(float %31, float 0.00e+00, float 1.00e+00) + %36 = insertelement 4 x float undef, float %32, i32 0 + %37 = insertelement 4 x float %36, float %33, i32 1 + %38 = insertelement 4 x float %37, float %34, i32 2 + %39 = insertelement 4 x float %38, float %35, i32 3 + call void @llvm.R600.store.swizzle(4 x float %39, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDIL.clamp.(float, float, float) readnone +declare void @llvm.R600.store.swizzle(4 x float, i32, i32) -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] R600/SI: Turn BUILD_VECTOR into Reg_Sequence
--- lib/Target/R600/AMDILISelDAGToDAG.cpp | 24 1 file changed, 24 insertions(+) diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index b125ba8..2f34fe3 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -160,6 +160,30 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } switch (Opc) { default: break; + case ISD::BUILD_VECTOR: { +// BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG +// that adds a 128 bits reg copy when going through TwoAddressInstructions +// pass. We want to avoid 128 bits copies as much as possible because they +// can't be bundled by our scheduler. +SDValue RegSeqArgs[9] = { + CurDAG-getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32), + SDValue(), CurDAG-getTargetConstant(AMDGPU::sub0, MVT::i32), + SDValue(), CurDAG-getTargetConstant(AMDGPU::sub1, MVT::i32), + SDValue(), CurDAG-getTargetConstant(AMDGPU::sub2, MVT::i32), + SDValue(), CurDAG-getTargetConstant(AMDGPU::sub3, MVT::i32) +}; +bool IsRegSeq = true; +for (unsigned i = 0; i N-getNumOperands(); i++) { + if (dyn_castRegisterSDNode(N-getOperand(i))) { +IsRegSeq = false; +break; + } + RegSeqArgs[2 * i + 1] = N-getOperand(i); +} +if (!IsRegSeq) + break; +return CurDAG-SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N-getVTList(), RegSeqArgs, 2 * N-getNumOperands() + 1); + } case ISD::ConstantFP: case ISD::Constant: { const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); -- 1.8.1.2 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] R600: initial scheduler code
From: Vadim Girlin vadimgir...@gmail.com This is a skeleton for a pre-RA MachineInstr scheduler strategy. Currently it only tries to expose more parallelism for ALU instructions (this also makes the distribution of GPR channels more uniform and increases the chances of ALU instructions to be packed together in a single VLIW group). Also it tries to reduce clause switching by grouping instruction of the same kind (ALU/FETCH/CF) together. Vincent Lejeune: - Support for VLIW4 Slot assignement - Recomputation of ScheduleDAG to get more parallelism opportunities --- lib/Target/R600/AMDGPUTargetMachine.cpp | 17 +- lib/Target/R600/R600MachineScheduler.cpp | 452 +++ lib/Target/R600/R600MachineScheduler.h | 119 3 files changed, 587 insertions(+), 1 deletion(-) create mode 100644 lib/Target/R600/R600MachineScheduler.cpp create mode 100644 lib/Target/R600/R600MachineScheduler.h diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 821e864..e6070cd 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -17,6 +17,7 @@ #include AMDGPU.h #include R600ISelLowering.h #include R600InstrInfo.h +#include R600MachineScheduler.h #include SIISelLowering.h #include SIInstrInfo.h #include llvm/Analysis/Passes.h @@ -39,6 +40,14 @@ extern C void LLVMInitializeR600Target() { RegisterTargetMachineAMDGPUTargetMachine X(TheAMDGPUTarget); } +static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { + return new ScheduleDAGMI(C, new R600SchedStrategy()); +} + +static MachineSchedRegistry +SchedCustomRegistry(r600, Run R600's custom scheduler, +createR600MachineScheduler); + AMDGPUTargetMachine::AMDGPUTargetMachine(const Target T, StringRef TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -70,7 +79,13 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase PM) -: TargetPassConfig(TM, PM) {} +: 
TargetPassConfig(TM, PM) { +const AMDGPUSubtarget ST = TM-getSubtargetAMDGPUSubtarget(); +if (ST.device()-getGeneration() = AMDGPUDeviceInfo::HD6XXX) { + enablePass(MachineSchedulerID); + MachineSchedRegistry::setDefault(createR600MachineScheduler); +} + } AMDGPUTargetMachine getAMDGPUTargetMachine() const { return getTMAMDGPUTargetMachine(); diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp new file mode 100644 index 000..229374c --- /dev/null +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -0,0 +1,452 @@ +//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===--===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot +// +//===--===// + +#define DEBUG_TYPE misched + +#include R600MachineScheduler.h +#include llvm/CodeGen/MachineRegisterInfo.h +#include llvm/CodeGen/LiveIntervalAnalysis.h +#include llvm/Pass.h +#include llvm/PassManager.h +#include set +#include iostream +using namespace llvm; + +/// \brief Recompute Output and Anti dependencies of incoming dag +/// ScheduleDAGInstrs has a conservative policy about subregisters dependencies. +/// All subreg write of a same superreg will be chained by Output/Anti deps. +/// These artificial deps delay releases of MI and thus reduce parallelism +/// oportunities. This function recompute the ScheduleDag to produce proper +/// subreg aware dependencies. 
+// Todo : It should also recompute Data dependencies +static +void RecomputeScheduleDAGMI(ScheduleDAGMI *dag) { + + // Remove all Output/Anti deps + for (unsigned i = 0; i dag-SUnits.size(); ++i) { +SUnit SU = dag-SUnits[i]; +for (SUnit::pred_iterator SUIt = SU.Preds.begin(), SUE = SU.Preds.end(); +SUIt != SUE; ++SUIt) { + SDep SD = *SUIt; + SUnit *SUPred = SD.getSUnit(); + if (SD.getKind() == SDep::Output) { +SUPred-removePred(SD); + } +} + } + +// Now recompute output/anti dependencies + for (unsigned i = 0; i dag-SUnits.size(); ++i) { +SUnit SU = dag-SUnits[i]; +MachineOperand DestMO = SU.getInstr()-getOperand(0); +unsigned DestReg = SU.getInstr()-getOperand(0).getReg(); +DEBUG(dbgs() Recomputing deps for ; SU.dump(dag); dbgs() \n;); +// Using LiveInterval should make things a lot more efficient, but we +// can't access them inside a MachineSchedStrategy. +// Scheduling occurs on a per MBB basis, so it is sufficient to get deps
Re: [Mesa-dev] [PATCH] R600: Fix regression with shadow array sampler on pre-SI GPUs.
- Mail original - De : Michel Dänzer mic...@daenzer.net À : Vincent Lejeune v...@ovi.com Cc : mesa-dev@lists.freedesktop.org Envoyé le : Lundi 11 février 2013 17h53 Objet : [PATCH] R600: Fix regression with shadow array sampler on pre-SI GPUs. From: Michel Dänzer michel.daen...@amd.com 'R600/SI: Use proper instructions for array/shadow samplers.' removed two cases from TEX_SHADOW. Vincent Lejeune reported on IRC that this broke some shadow array piglit tests with the r600g driver. Reinstating the removed cases should fix this, and still works with radeonsi as well. Signed-off-by: Michel Dänzer michel.daen...@amd.com --- Vincent, can you confirm this fixes the regression? It does, thanks! lib/Target/R600/R600Instructions.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index d307ed2..1069570 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -399,7 +399,7 @@ class R600_TEX bits11 inst, string opName, listdag pattern, def TEX_SHADOW : PatLeaf (imm), [{uint32_t TType = (uint32_t)N-getZExtValue(); - return (TType = 6 TType = 8) || TType == 13; + return (TType = 6 TType = 8) || (TType = 11 TType = 13); }] ; -- 1.8.1.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600/SI: Use MULADD_IEEE/V_MAD_F32 instruction for mad pattern
--- lib/Target/R600/AMDGPUISelLowering.cpp | 10 +++--- lib/Target/R600/AMDGPUISelLowering.h | 1 - lib/Target/R600/AMDILISelLowering.cpp | 3 ++- lib/Target/R600/AMDILInstrInfo.td | 1 - lib/Target/R600/AMDILIntrinsics.td | 10 -- lib/Target/R600/R600Instructions.td| 9 - lib/Target/R600/SIInstructions.td | 4 ++-- test/CodeGen/R600/fmad.ll | 19 +++ 8 files changed, 34 insertions(+), 23 deletions(-) create mode 100644 test/CodeGen/R600/fmad.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index d0d23d6..0a33264 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -127,9 +127,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerIntrinsicLRP(Op, DAG); case AMDGPUIntrinsic::AMDIL_fraction: return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); -case AMDGPUIntrinsic::AMDIL_mad: - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3)); case AMDGPUIntrinsic::AMDIL_max: return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -176,9 +173,9 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, Op.getOperand(1)); SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, Op.getOperand(3)); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), - Op.getOperand(2), - OneSubAC); + return DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), + OneSubAC); } /// \brief Generate Min/Max node @@ -393,7 +390,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { default: return 0; // AMDIL DAG nodes - NODE_NAME_CASE(MAD); NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); NODE_NAME_CASE(DIV_INF); diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 4b844a3..f27b5db 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ 
b/lib/Target/R600/AMDGPUISelLowering.h @@ -108,7 +108,6 @@ namespace AMDGPUISD { enum { // AMDIL ISD Opcodes FIRST_NUMBER = ISD::BUILTIN_OP_END, - MAD, // 32bit Fused Multiply Add instruction CALL,// Function call based on a single integer UMUL,// 32bit unsigned multiplication DIV_INF, // Divide with infinity returned on zero divisor diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 2e60adc..3480ac8 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -451,7 +451,8 @@ AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG DAG) const { SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, + DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); // int iq = (int)fq; SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td index e969bbf..110f147 100644 --- a/lib/Target/R600/AMDILInstrInfo.td +++ b/lib/Target/R600/AMDILInstrInfo.td @@ -116,7 +116,6 @@ def IL_retflag : SDNodeAMDGPUISD::RET_FLAG, SDTNone, //======// // Floating point math functions def IL_div_inf : SDNodeAMDGPUISD::DIV_INF, SDTIL_GenBinaryOp; -def IL_mad : SDNodeAMDGPUISD::MAD, SDTIL_GenTernaryOp; //===--===// // Integer functions diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td index 3f9e20f..6ec3559 100644 --- a/lib/Target/R600/AMDILIntrinsics.td +++ b/lib/Target/R600/AMDILIntrinsics.td @@ -92,12 +92,6 @@ let TargetPrefix = AMDIL, isTarget = 1 in { TernaryIntInt; def int_AMDIL_bfm : GCCBuiltin__amdil_bfm, BinaryIntInt; - def int_AMDIL_mad_i32 : GCCBuiltin__amdil_imad, - TernaryIntInt; - def int_AMDIL_mad_u32 : GCCBuiltin__amdil_umad, - TernaryIntInt; - def int_AMDIL_mad : GCCBuiltin__amdil_mad, - TernaryIntFloat; def int_AMDIL_mulhi_i32 : 
GCCBuiltin__amdil_imul_high, BinaryIntInt; def int_AMDIL_mulhi_u32 : GCCBuiltin__amdil_umul_high, @@ -110,10 +104,6 @@ let TargetPrefix = AMDIL, isTarget = 1 in { BinaryIntInt; def int_AMDIL_mulhi24_u32 : GCCBuiltin__amdil_umul24_high, BinaryIntInt; - def int_AMDIL_mad24_i32
[Mesa-dev] [PATCH] R600: Use MULADD_IEEE instruction for mad pattern
--- lib/Target/R600/AMDGPUISelLowering.cpp | 10 +++--- lib/Target/R600/AMDGPUISelLowering.h | 1 - lib/Target/R600/AMDILISelLowering.cpp | 3 ++- lib/Target/R600/AMDILInstrInfo.td | 1 - lib/Target/R600/AMDILIntrinsics.td | 10 -- lib/Target/R600/R600Instructions.td| 9 - lib/Target/R600/SIInstructions.td | 2 +- test/CodeGen/R600/fmad.ll | 19 +++ 8 files changed, 33 insertions(+), 22 deletions(-) create mode 100644 test/CodeGen/R600/fmad.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index f3a047a..530da5a 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -127,9 +127,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerIntrinsicLRP(Op, DAG); case AMDGPUIntrinsic::AMDIL_fraction: return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); -case AMDGPUIntrinsic::AMDIL_mad: - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3)); case AMDGPUIntrinsic::AMDIL_max: return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -176,9 +173,9 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, Op.getOperand(1)); SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, Op.getOperand(3)); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), - Op.getOperand(2), - OneSubAC); + return DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), + OneSubAC); } /// \brief Generate Min/Max node @@ -393,7 +390,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { default: return 0; // AMDIL DAG nodes - NODE_NAME_CASE(MAD); NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); NODE_NAME_CASE(DIV_INF); diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 0584d39..e4d77e3 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ 
b/lib/Target/R600/AMDGPUISelLowering.h @@ -103,7 +103,6 @@ namespace AMDGPUISD { enum { // AMDIL ISD Opcodes FIRST_NUMBER = ISD::BUILTIN_OP_END, - MAD, // 32bit Fused Multiply Add instruction CALL,// Function call based on a single integer UMUL,// 32bit unsigned multiplication DIV_INF, // Divide with infinity returned on zero divisor diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 8bfd30c..1dd0270 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -451,7 +451,8 @@ AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG DAG) const { SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, + DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); // int iq = (int)fq; SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td index e969bbf..110f147 100644 --- a/lib/Target/R600/AMDILInstrInfo.td +++ b/lib/Target/R600/AMDILInstrInfo.td @@ -116,7 +116,6 @@ def IL_retflag : SDNodeAMDGPUISD::RET_FLAG, SDTNone, //======// // Floating point math functions def IL_div_inf : SDNodeAMDGPUISD::DIV_INF, SDTIL_GenBinaryOp; -def IL_mad : SDNodeAMDGPUISD::MAD, SDTIL_GenTernaryOp; //===--===// // Integer functions diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td index 3f9e20f..6ec3559 100644 --- a/lib/Target/R600/AMDILIntrinsics.td +++ b/lib/Target/R600/AMDILIntrinsics.td @@ -92,12 +92,6 @@ let TargetPrefix = AMDIL, isTarget = 1 in { TernaryIntInt; def int_AMDIL_bfm : GCCBuiltin__amdil_bfm, BinaryIntInt; - def int_AMDIL_mad_i32 : GCCBuiltin__amdil_imad, - TernaryIntInt; - def int_AMDIL_mad_u32 : GCCBuiltin__amdil_umad, - TernaryIntInt; - def int_AMDIL_mad : GCCBuiltin__amdil_mad, - TernaryIntFloat; def int_AMDIL_mulhi_i32 : 
GCCBuiltin__amdil_imul_high, BinaryIntInt; def int_AMDIL_mulhi_u32 : GCCBuiltin__amdil_umul_high, @@ -110,10 +104,6 @@ let TargetPrefix = AMDIL, isTarget = 1 in { BinaryIntInt; def int_AMDIL_mulhi24_u32 : GCCBuiltin__amdil_umul24_high, BinaryIntInt; - def int_AMDIL_mad24_i32 :
[Mesa-dev] [PATCH] R600: Do not fold modifier/literals in vector inst
This fixes a couple of regressions on (probably not just) cayman --- lib/Target/R600/AMDILISelDAGToDAG.cpp | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index 84223f6..7fc3a2f 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -229,7 +229,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { continue; } } else { -if (!TII-isALUInstr(Use-getMachineOpcode())) { +if (!TII-isALUInstr(Use-getMachineOpcode()) || +(TII-get(Use-getMachineOpcode()).TSFlags +R600_InstFlag::VECTOR)) { continue; } @@ -272,7 +274,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (ST.device()-getGeneration() = AMDGPUDeviceInfo::HD6XXX) { const R600InstrInfo *TII = static_castconst R600InstrInfo*(TM.getInstrInfo()); -if (Result Result-isMachineOpcode() +if (Result Result-isMachineOpcode() +!(TII-get(Result-getMachineOpcode()).TSFlags R600_InstFlag::VECTOR) TII-isALUInstr(Result-getMachineOpcode())) { // Fold FNEG/FABS/CONST_ADDRESS // TODO: Isel can generate multiple MachineInst, we need to recursively -- 1.8.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/4] R600: Use MULADD_IEEE instruction for mad pattern
--- lib/Target/R600/AMDGPUISelLowering.cpp | 6 +++--- lib/Target/R600/AMDILISelLowering.cpp | 3 ++- lib/Target/R600/R600Instructions.td| 8 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index f3a047a..40c2f5f 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -176,9 +176,9 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, Op.getOperand(1)); SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, Op.getOperand(3)); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), - Op.getOperand(2), - OneSubAC); + return DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), + OneSubAC); } /// \brief Generate Min/Max node diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 8bfd30c..1dd0270 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -451,7 +451,8 @@ AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG DAG) const { SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, + DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); // int iq = (int)fq; SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index bcbb5a1..d3cee56 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -916,6 +916,12 @@ class MULADD_Common bits5 inst : R600_3OP (IL_mad R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2))] ; +class MULADD_IEEE_Common bits5 inst : R600_3OP + inst, MULADD_IEEE, + [(set (f32 R600_Reg32:$dst), + (fadd (fmul R600_Reg32:$src0, R600_Reg32:$src1), R600_Reg32:$src2))] +; + class CNDE_Common 
bits5 inst : R600_3OP inst, CNDE, [(set R600_Reg32:$dst, @@ -1070,6 +1076,7 @@ let Predicates = [isR600] in { def MUL_LIT_r600 : MUL_LIT_Common0x0C; def MULADD_r600 : MULADD_Common0x10; + def MULADD_IEEE_r600 : MULADD_IEEE_Common0x14; def CNDE_r600 : CNDE_Common0x18; def CNDGT_r600 : CNDGT_Common0x19; def CNDGE_r600 : CNDGE_Common0x1A; @@ -1209,6 +1216,7 @@ let Predicates = [isEGorCayman] in { ; def MULADD_eg : MULADD_Common0x14; + def MULADD_IEEE_eg : MULADD_IEEE_Common0x18; def ASHR_eg : ASHR_Common0x15; def LSHR_eg : LSHR_Common0x16; def LSHL_eg : LSHL_Common0x17; -- 1.8.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/4] R600: Do not fold modifier/literals in vector inst
This fixes a couple of regressions on (probably not just) cayman --- lib/Target/R600/AMDILISelDAGToDAG.cpp | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index 84223f6..7fc3a2f 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -229,7 +229,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { continue; } } else { -if (!TII-isALUInstr(Use-getMachineOpcode())) { +if (!TII-isALUInstr(Use-getMachineOpcode()) || +(TII-get(Use-getMachineOpcode()).TSFlags +R600_InstFlag::VECTOR)) { continue; } @@ -272,7 +274,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (ST.device()-getGeneration() = AMDGPUDeviceInfo::HD6XXX) { const R600InstrInfo *TII = static_castconst R600InstrInfo*(TM.getInstrInfo()); -if (Result Result-isMachineOpcode() +if (Result Result-isMachineOpcode() +!(TII-get(Result-getMachineOpcode()).TSFlags R600_InstFlag::VECTOR) TII-isALUInstr(Result-getMachineOpcode())) { // Fold FNEG/FABS/CONST_ADDRESS // TODO: Isel can generate multiple MachineInst, we need to recursively -- 1.8.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/4] R600: Fold zero/one in export instructions
--- lib/Target/R600/R600ISelLowering.cpp | 111 --- lib/Target/R600/R600Instructions.td | 20 ++- lib/Target/R600/R600Intrinsics.td| 3 - 3 files changed, 55 insertions(+), 79 deletions(-) diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 4dc6729..f796738 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -279,57 +279,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( using namespace llvm::Intrinsic; using namespace llvm::AMDGPUIntrinsic; -static SDValue -InsertScalarToRegisterExport(SelectionDAG DAG, DebugLoc DL, SDNode **ExportMap, -unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type, -SDValue Scalar, SDValue Chain) { - if (!ExportMap[Slot]) { -SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, - DL, MVT::v4f32, - DAG.getUNDEF(MVT::v4f32), - Scalar, - DAG.getConstant(Channel, MVT::i32)); - -unsigned Mask = 1 Channel; - -const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32), -DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32), -DAG.getConstant(Mask, MVT::i32)}; - -SDValue Res = DAG.getNode( -AMDGPUISD::EXPORT, -DL, -MVT::Other, -Ops, 6); - ExportMap[Slot] = Res.getNode(); - return Res; - } - - SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ; - SDValue PreviousVector = ExportInstruction-getOperand(1); - SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, - DL, MVT::v4f32, - PreviousVector, - Scalar, - DAG.getConstant(Channel, MVT::i32)); - - unsigned Mask = dyn_castConstantSDNode(ExportInstruction-getOperand(5)) - -getZExtValue(); - Mask |= (1 Channel); - - const SDValue Ops[] = {ExportInstruction-getOperand(0), Vector, - DAG.getConstant(Inst, MVT::i32), - DAG.getConstant(Type, MVT::i32), - DAG.getConstant(Slot, MVT::i32), - DAG.getConstant(Mask, MVT::i32)}; - - DAG.UpdateNodeOperands(ExportInstruction, - Ops, 6); - - return Chain; - -} - SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG 
DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -356,16 +305,19 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const } return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2)); } -case AMDGPUIntrinsic::R600_store_pixel_color: { - MachineFunction MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfoR600MachineFunctionInfo(); - int64_t RegIndex = castConstantSDNode(Op.getOperand(3))-getZExtValue(); - - SDNode **OutputsMap = MFI-Outputs; - return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap, - RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2), - Chain); - +case AMDGPUIntrinsic::R600_store_swizzle: { + const SDValue Args[8] = { +Chain, +Op.getOperand(2), // Export Value +Op.getOperand(3), // ArrayBase +Op.getOperand(4), // Type +DAG.getConstant(0, MVT::i32), // SWZ_X +DAG.getConstant(1, MVT::i32), // SWZ_Y +DAG.getConstant(2, MVT::i32), // SWZ_Z +DAG.getConstant(3, MVT::i32) // SWZ_W + }; + return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(), + Args, 8); } // default for switch(IntrinsicID) @@ -962,6 +914,43 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } } } + case AMDGPUISD::EXPORT: { +SDValue Arg = N-getOperand(1); +if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; +SDValue NewBldVec[4] = { +DAG.getUNDEF(MVT::f32), +DAG.getUNDEF(MVT::f32), +DAG.getUNDEF(MVT::f32), +DAG.getUNDEF(MVT::f32) + }; +SDValue NewArgs[8] = { + N-getOperand(0), // Chain + SDValue(), + N-getOperand(2), // ArrayBase + N-getOperand(3), // Type + N-getOperand(4), // SWZ_X + N-getOperand(5), // SWZ_Y + N-getOperand(6), // SWZ_Z + N-getOperand(7) // SWZ_W +}; +for (unsigned i = 0; i Arg.getNumOperands(); i++) { + if (ConstantFPSDNode *C = dyn_castConstantFPSDNode(Arg.getOperand(i))) { +if (C-isZero()) { + NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0 +} else if (C-isExactlyValue(1.0)) { + NewArgs[4 + i] = 
DAG.getConstant(5, MVT::i32); // SEL_0 +} else { + NewBldVec[i] = Arg.getOperand(i); +} + } else { +NewBldVec[i] = Arg.getOperand(i); + } +} +DebugLoc DL = N-getDebugLoc(); +NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4); +return DAG.getNode(AMDGPUISD::EXPORT, DL, N-getVTList(), NewArgs, 8); + } } return SDValue(); } diff --git
[Mesa-dev] [PATCH 4/4] R600: Export instructions are no longer terminators
This allows MachineInstScheduler to reorder them, and thus make scheduling more efficient. --- lib/Target/R600/R600Instructions.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 3c043aa..82a63df 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -625,7 +625,7 @@ multiclass SteamOutputExportPatternInstruction ExportInst, 4095, imm:$mask, buf3inst, 0); } -let isTerminator = 1, usesCustomInserter = 1 in { +let usesCustomInserter = 1 in { class ExportSwzInst : InstR600ISA( outs), @@ -639,7 +639,7 @@ class ExportSwzInst : InstR600ISA( let Inst{63-32} = Word1; } -} // End isTerminator = 1, usesCustomInserter = 1 +} // End usesCustomInserter = 1 class ExportBufInst : InstR600ISA( outs), -- 1.8.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600: Make store_dummy intrinsic more general by passing export type
--- lib/Target/R600/R600Instructions.td | 9 +++-- lib/Target/R600/R600Intrinsics.td | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 13293b6..3537906 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -608,9 +608,14 @@ multiclass ExportPatternInstruction ExportInst, bits8 cf_inst { 0, 61, 7, 0, 7, 7, cf_inst, 0) ; - def : Pat(int_R600_store_pixel_dummy), + def : Pat(int_R600_store_dummy (i32 imm:$type)), (ExportInst -(v4f32 (IMPLICIT_DEF)), 0, 0, 7, 7, 7, 7, cf_inst, 0) +(v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) + ; + + def : Pat(int_R600_store_dummy 1), +(ExportInst +(v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) ; def : Pat(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td index 4c652a6..b5e4f1e 100644 --- a/lib/Target/R600/R600Intrinsics.td +++ b/lib/Target/R600/R600Intrinsics.td @@ -24,6 +24,6 @@ let TargetPrefix = R600, isTarget = 1 in { Intrinsic[], [llvm_float_ty], []; def int_R600_store_pixel_stencil : Intrinsic[], [llvm_float_ty], []; - def int_R600_store_pixel_dummy : - Intrinsic[], [], []; + def int_R600_store_dummy : + Intrinsic[], [llvm_i32_ty], []; } -- 1.8.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600g/llvm: Add dummy export for vs output
Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=59588 --- src/gallium/drivers/r600/r600_llvm.c | 22 -- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index 32b8e56..913dccc 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -374,9 +374,27 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) } } } + // Add dummy exports + if (ctx-type == TGSI_PROCESSOR_VERTEX) { + if (!next_param) { + lp_build_intrinsic_unary(base-gallivm-builder, llvm.R600.store.dummy, + LLVMVoidTypeInContext(base-gallivm-context), + lp_build_const_int32(base-gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)); + } + if (!(next_pos-60)) { + lp_build_intrinsic_unary(base-gallivm-builder, llvm.R600.store.dummy, + LLVMVoidTypeInContext(base-gallivm-context), + lp_build_const_int32(base-gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS)); + } + } + if (ctx-type == TGSI_PROCESSOR_FRAGMENT) { + if (!has_color) { + lp_build_intrinsic_unary(base-gallivm-builder, llvm.R600.store.dummy, + LLVMVoidTypeInContext(base-gallivm-context), + lp_build_const_int32(base-gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL)); + } + } - if (!has_color ctx-type == TGSI_PROCESSOR_FRAGMENT) - lp_build_intrinsic(base-gallivm-builder, llvm.R600.store.pixel.dummy, LLVMVoidTypeInContext(base-gallivm-context), 0, 0); } static void llvm_emit_tex( -- 1.8.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] R600: Fold remaining CONST_COPY after expand pseudo inst
--- lib/Target/R600/AMDGPUTargetMachine.cpp | 2 +- lib/Target/R600/R600LowerConstCopy.cpp | 170 +--- 2 files changed, 160 insertions(+), 12 deletions(-) diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 7b069e7..2185be3 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -136,8 +136,8 @@ bool AMDGPUPassConfig::addPreEmitPass() { addPass(createAMDGPUCFGPreparationPass(*TM)); addPass(createAMDGPUCFGStructurizerPass(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); -addPass(createR600LowerConstCopy(*TM)); addPass(FinalizeMachineBundlesID); +addPass(createR600LowerConstCopy(*TM)); } else { addPass(createSILowerLiteralConstantsPass(*TM)); addPass(createSILowerControlFlowPass(*TM)); diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp index d14ae20..2557e8f 100644 --- a/lib/Target/R600/R600LowerConstCopy.cpp +++ b/lib/Target/R600/R600LowerConstCopy.cpp @@ -13,7 +13,6 @@ /// fold them inside vector instruction, like DOT4 or Cube ; ISel emits /// ConstCopy instead. This pass (executed after ExpandingSpecialInstr) will try /// to fold them if possible or replace them by MOV otherwise. -/// TODO : Implement the folding part, using Copy Propagation algorithm. 
// //===--===// @@ -30,6 +29,13 @@ class R600LowerConstCopy : public MachineFunctionPass { private: static char ID; const R600InstrInfo *TII; + + struct ConstPairs { +unsigned XYPair; +unsigned ZWPair; + }; + + bool canFoldInBundle(ConstPairs &UsedConst, unsigned ReadConst) const; public: R600LowerConstCopy(TargetMachine &tm); virtual bool runOnMachineFunction(MachineFunction &MF); @@ -39,27 +45,169 @@ public: char R600LowerConstCopy::ID = 0; - R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) : MachineFunctionPass(ID), TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { } +bool R600LowerConstCopy::canFoldInBundle(ConstPairs &UsedConst, +unsigned ReadConst) const { + unsigned ReadConstChan = ReadConst & 3; + unsigned ReadConstIndex = ReadConst & (~3); + if (ReadConstChan < 2) { +if (!UsedConst.XYPair) { + UsedConst.XYPair = ReadConstIndex; +} +return UsedConst.XYPair == ReadConstIndex; + } else { +if (!UsedConst.ZWPair) { + UsedConst.ZWPair = ReadConstIndex; +} +return UsedConst.ZWPair == ReadConstIndex; + } +} + +static bool isControlFlow(const MachineInstr &MI) { + return (MI.getOpcode() == AMDGPU::IF_PREDICATE_SET) || + (MI.getOpcode() == AMDGPU::ENDIF) || + (MI.getOpcode() == AMDGPU::ELSE) || + (MI.getOpcode() == AMDGPU::WHILELOOP) || + (MI.getOpcode() == AMDGPU::BREAK); +} + bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) { + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; -for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E;) { - MachineInstr &MI = *I; - I = llvm::next(I); - if (MI.getOpcode() != AMDGPU::CONST_COPY) +DenseMap<unsigned, MachineInstr *> RegToConstIndex; +for (MachineBasicBlock::instr_iterator I = MBB.instr_begin(), +E = MBB.instr_end(); I != E;) { + + if (I->getOpcode() == AMDGPU::CONST_COPY) { +MachineInstr &MI = *I; +I = llvm::next(I); +unsigned DstReg = MI.getOperand(0).getReg(); +DenseMap<unsigned, MachineInstr *>::iterator SrcMI = 
+RegToConstIndex.find(DstReg); +if (SrcMI != RegToConstIndex.end()) { + SrcMI->second->eraseFromParent(); + RegToConstIndex.erase(SrcMI); +} +MachineInstr *NewMI = +TII->buildDefaultInstruction(MBB, MI, AMDGPU::MOV, +MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); +TII->setImmOperand(NewMI, R600Operands::SRC0_SEL, +MI.getOperand(1).getImm()); +RegToConstIndex[DstReg] = NewMI; +MI.eraseFromParent(); continue; - MachineInstr *NewMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::MOV, - MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); - NewMI->getOperand(9).setImm(MI.getOperand(1).getImm()); - MI.eraseFromParent(); + } + + std::vector<unsigned> Defs; + // We consider all Instructions as bundled because algorithm that handle + // const read port limitations inside an IG is still valid with single + // instructions. + std::vector<MachineInstr *> Bundle; + + if (I->isBundle()) { +unsigned BundleSize = I->getBundleSize(); +for (unsigned i = 0; i < BundleSize; i++) { + I = llvm::next(I); + Bundle.push_back(I); +} + } else if
[Mesa-dev] [PATCH 1/2] R600: Fold remaining CONST_COPY after expand pseudo inst
v2:fix a bug with write masked inst --- lib/Target/R600/AMDGPUTargetMachine.cpp | 2 +- lib/Target/R600/R600LowerConstCopy.cpp | 164 +--- 2 files changed, 154 insertions(+), 12 deletions(-) diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 7b069e7..2185be3 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -136,8 +136,8 @@ bool AMDGPUPassConfig::addPreEmitPass() { addPass(createAMDGPUCFGPreparationPass(*TM)); addPass(createAMDGPUCFGStructurizerPass(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); -addPass(createR600LowerConstCopy(*TM)); addPass(FinalizeMachineBundlesID); +addPass(createR600LowerConstCopy(*TM)); } else { addPass(createSILowerLiteralConstantsPass(*TM)); addPass(createSILowerControlFlowPass(*TM)); diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp index d14ae20..74260ad 100644 --- a/lib/Target/R600/R600LowerConstCopy.cpp +++ b/lib/Target/R600/R600LowerConstCopy.cpp @@ -13,7 +13,6 @@ /// fold them inside vector instruction, like DOT4 or Cube ; ISel emits /// ConstCopy instead. This pass (executed after ExpandingSpecialInstr) will try /// to fold them if possible or replace them by MOV otherwise. -/// TODO : Implement the folding part, using Copy Propagation algorithm. 
// //===--===// @@ -28,8 +27,16 @@ namespace llvm { class R600LowerConstCopy : public MachineFunctionPass { private: + typedef DenseMap<unsigned, MachineInstr *> SourceMap; static char ID; const R600InstrInfo *TII; + + struct ConstPairs { +unsigned XYPair; +unsigned ZWPair; + }; + + bool canFoldInBundle(ConstPairs &UsedConst, unsigned ReadConst) const; public: R600LowerConstCopy(TargetMachine &tm); virtual bool runOnMachineFunction(MachineFunction &MF); @@ -39,27 +46,162 @@ public: char R600LowerConstCopy::ID = 0; - R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) : MachineFunctionPass(ID), TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { } +bool R600LowerConstCopy::canFoldInBundle(ConstPairs &UsedConst, +unsigned ReadConst) const { + unsigned ReadConstChan = ReadConst & 3; + unsigned ReadConstIndex = ReadConst & (~3); + if (ReadConstChan < 2) { +if (!UsedConst.XYPair) { + UsedConst.XYPair = ReadConstIndex; +} +return UsedConst.XYPair == ReadConstIndex; + } else { +if (!UsedConst.ZWPair) { + UsedConst.ZWPair = ReadConstIndex; +} +return UsedConst.ZWPair == ReadConstIndex; + } +} + +static bool isControlFlow(const MachineInstr &MI) { + return (MI.getOpcode() == AMDGPU::IF_PREDICATE_SET) || + (MI.getOpcode() == AMDGPU::ENDIF) || + (MI.getOpcode() == AMDGPU::ELSE) || + (MI.getOpcode() == AMDGPU::WHILELOOP) || + (MI.getOpcode() == AMDGPU::BREAK); +} + bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) { + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; -for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E;) { - MachineInstr &MI = *I; - I = llvm::next(I); - if (MI.getOpcode() != AMDGPU::CONST_COPY) +SourceMap RegToConstIndex; +for (MachineBasicBlock::instr_iterator I = MBB.instr_begin(), +E = MBB.instr_end(); I != E;) { + + if (I->getOpcode() == AMDGPU::CONST_COPY) { +MachineInstr &MI = *I; +I = llvm::next(I); +unsigned DstReg = MI.getOperand(0).getReg(); 
+SourceMap::iterator SrcMI = RegToConstIndex.find(DstReg); +if (SrcMI != RegToConstIndex.end()) { + SrcMI->second->eraseFromParent(); + RegToConstIndex.erase(SrcMI); +} +MachineInstr *NewMI = +TII->buildDefaultInstruction(MBB, MI, AMDGPU::MOV, +MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); +NewMI->getOperand(9).setImm(MI.getOperand(1).getImm()); +RegToConstIndex[DstReg] = NewMI; +MI.eraseFromParent(); continue; - MachineInstr *NewMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::MOV, - MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); - NewMI->getOperand(9).setImm(MI.getOperand(1).getImm()); - MI.eraseFromParent(); + } + + std::vector<unsigned> Defs; + // We consider all Instructions as bundled because algorithm that handle + // const read port limitations inside an IG is still valid with single + // instructions. + std::vector<MachineInstr *> Bundle; + + if (I->isBundle()) { +unsigned BundleSize = I->getBundleSize(); +for (unsigned i = 0; i < BundleSize; i++) { + I = llvm::next(I); + Bundle.push_back(I); +} + }