Module: Mesa
Branch: main
Commit: cc68b7cd944f22e59e574891236840b8d63459d1
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=cc68b7cd944f22e59e574891236840b8d63459d1
Author: Jason Ekstrand <[email protected]> Date: Fri Sep 2 22:16:05 2022 -0500 hasvk/pipeline: Rip out SKL+ v2: Fix incorrect DispatchMode removal (Lionel) Reviewed-by: Lionel Landwerlin <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19852> --- src/intel/vulkan_hasvk/genX_pipeline.c | 258 ++------------------------------- 1 file changed, 13 insertions(+), 245 deletions(-) diff --git a/src/intel/vulkan_hasvk/genX_pipeline.c b/src/intel/vulkan_hasvk/genX_pipeline.c index b35d5759c3d..f782806b66c 100644 --- a/src/intel/vulkan_hasvk/genX_pipeline.c +++ b/src/intel/vulkan_hasvk/genX_pipeline.c @@ -354,11 +354,6 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) .ConstantInterpolationEnable = wm_prog_data->flat_inputs, }; -#if GFX_VER >= 9 - for (unsigned i = 0; i < 32; i++) - sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; -#endif - #if GFX_VER >= 8 /* On Broadwell, they broke 3DSTATE_SBE into two packets */ struct GENX(3DSTATE_SBE_SWIZ) swiz = { @@ -584,16 +579,12 @@ genX(rasterization_mode)(VkPolygonMode raster_mode, switch (line_mode) { case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT: *api_mode = DX100; -#if GFX_VER <= 9 - /* Prior to ICL, the algorithm the HW uses to draw wide lines - * doesn't quite match what the CTS expects, at least for rectangular - * lines, so we set this to false here, making it draw parallelograms - * instead, which work well enough. + /* The algorithm the HW uses to draw wide lines doesn't quite match + * what the CTS expects, at least for rectangular lines, so we set + * this to false here, making it draw parallelograms instead, which + * work well enough. 
*/ *msaa_rasterization_enable = line_width < 1.0078125; -#else - *msaa_rasterization_enable = true; -#endif break; case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT: @@ -652,10 +643,6 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline, sf.LineStippleEnable = rs->line.stipple.enable; #endif -#if GFX_VER >= 12 - sf.DerefBlockSize = urb_deref_block_size; -#endif - bool point_from_shader; const struct brw_vue_prog_data *last_vue_prog_data = anv_pipeline_get_last_vue_prog_data(pipeline); @@ -692,19 +679,10 @@ emit_rs_state(struct anv_graphics_pipeline *pipeline, raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs->polygon_mode]; raster.ScissorRectangleEnable = true; -#if GFX_VER >= 9 - /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */ - raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable; - raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable; -#elif GFX_VER >= 8 +#if GFX_VER >= 8 raster.ViewportZClipTestEnable = pipeline->depth_clip_enable; #endif -#if GFX_VER >= 9 - raster.ConservativeRasterizationEnable = - rs->conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT; -#endif - #if GFX_VER == 7 /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it * can get the depth offsets correct. @@ -1380,44 +1358,20 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline) #endif assert(!vs_prog_data->base.base.use_alt_mode); -#if GFX_VER < 11 vs.SingleVertexDispatch = false; -#endif vs.VectorMaskEnable = false; /* Wa_1606682166: * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. * Disable the Sampler state prefetch functionality in the SARB by * programming 0xB000[30] to '1'. */ - vs.SamplerCount = GFX_VER == 11 ? 
0 : get_sampler_count(vs_bin); + vs.SamplerCount = get_sampler_count(vs_bin); vs.BindingTableEntryCount = vs_bin->bind_map.surface_count; vs.FloatingPointMode = IEEE754; vs.IllegalOpcodeExceptionEnable = false; vs.SoftwareExceptionEnable = false; vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1; - if (GFX_VER == 9 && devinfo->gt == 4 && - anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) { - /* On Sky Lake GT4, we have experienced some hangs related to the VS - * cache and tessellation. It is unknown exactly what is happening - * but the Haswell docs for the "VS Reference Count Full Force Miss - * Enable" field of the "Thread Mode" register refer to a HSW bug in - * which the VUE handle reference count would overflow resulting in - * internal reference counting bugs. My (Jason's) best guess is that - * this bug cropped back up on SKL GT4 when we suddenly had more - * threads in play than any previous gfx9 hardware. - * - * What we do know for sure is that setting this bit when - * tessellation shaders are in use fixes a GPU hang in Batman: Arkham - * City when playing with DXVK (https://bugs.freedesktop.org/107280). - * Disabling the vertex cache with tessellation shaders should only - * have a minor performance impact as the tessellation shaders are - * likely generating and processing far more geometry than the vertex - * stage. 
- */ - vs.VertexCacheDisable = true; - } - vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length; vs.VertexURBEntryReadOffset = 0; vs.DispatchGRFStartRegisterForURBData = @@ -1430,14 +1384,9 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline) vs_prog_data->base.cull_distance_mask; #endif -#if GFX_VERx10 >= 125 - vs.ScratchSpaceBuffer = - get_scratch_surf(&pipeline->base, MESA_SHADER_VERTEX, vs_bin); -#else vs.PerThreadScratchSpace = get_scratch_space(vs_bin); vs.ScratchSpaceBasePointer = get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin); -#endif } } @@ -1466,19 +1415,9 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline, hs.StatisticsEnable = true; hs.KernelStartPointer = tcs_bin->kernel.offset; /* Wa_1606682166 */ - hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin); + hs.SamplerCount = get_sampler_count(tcs_bin); hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count; -#if GFX_VER >= 12 - /* Wa_1604578095: - * - * Hang occurs when the number of max threads is less than 2 times - * the number of instance count. The number of max threads must be - * more than 2 times the number of instance count. 
- */ - assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances); -#endif - hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; hs.IncludeVertexHandles = true; hs.InstanceCount = tcs_prog_data->instances - 1; @@ -1487,31 +1426,10 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline, hs.VertexURBEntryReadOffset = 0; hs.DispatchGRFStartRegisterForURBData = tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f; -#if GFX_VER >= 12 - hs.DispatchGRFStartRegisterForURBData5 = - tcs_prog_data->base.base.dispatch_grf_start_reg >> 5; -#endif -#if GFX_VERx10 >= 125 - hs.ScratchSpaceBuffer = - get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin); -#else hs.PerThreadScratchSpace = get_scratch_space(tcs_bin); hs.ScratchSpaceBasePointer = get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin); -#endif - -#if GFX_VER == 12 - /* Patch Count threshold specifies the maximum number of patches that - * will be accumulated before a thread dispatch is forced. 
- */ - hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold; -#endif - -#if GFX_VER >= 9 - hs.DispatchMode = tcs_prog_data->base.dispatch_mode; - hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id; -#endif } anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) { @@ -1534,16 +1452,6 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline, te.TEEnable = true; te.MaximumTessellationFactorOdd = 63.0; te.MaximumTessellationFactorNotOdd = 64.0; -#if GFX_VERx10 >= 125 - te.TessellationDistributionMode = TEDMODE_RR_FREE; - te.TessellationDistributionLevel = TEDLEVEL_PATCH; - /* 64_TRIANGLES */ - te.SmallPatchThreshold = 3; - /* 1K_TRIANGLES */ - te.TargetBlockSize = 8; - /* 1K_TRIANGLES */ - te.LocalBOPAccumulatorThreshold = 1; -#endif } anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) { @@ -1551,7 +1459,7 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline, ds.StatisticsEnable = true; ds.KernelStartPointer = tes_bin->kernel.offset; /* Wa_1606682166 */ - ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin); + ds.SamplerCount = get_sampler_count(tes_bin); ds.BindingTableEntryCount = tes_bin->bind_map.surface_count; ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1; @@ -1564,15 +1472,10 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline, tes_prog_data->base.base.dispatch_grf_start_reg; #if GFX_VER >= 8 -#if GFX_VER < 11 ds.DispatchMode = tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ? 
- DISPATCH_MODE_SIMD8_SINGLE_PATCH : - DISPATCH_MODE_SIMD4X2; -#else - assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8); - ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH; -#endif + DISPATCH_MODE_SIMD8_SINGLE_PATCH : + DISPATCH_MODE_SIMD4X2; ds.UserClipDistanceClipTestEnableBitmask = tes_prog_data->base.clip_distance_mask; @@ -1580,17 +1483,9 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline, tes_prog_data->base.cull_distance_mask; #endif -#if GFX_VER >= 12 - ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id; -#endif -#if GFX_VERx10 >= 125 - ds.ScratchSpaceBuffer = - get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin); -#else ds.PerThreadScratchSpace = get_scratch_space(tes_bin); ds.ScratchSpaceBasePointer = get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin); -#endif } } @@ -1617,7 +1512,7 @@ emit_3dstate_gs(struct anv_graphics_pipeline *pipeline) gs.SingleProgramFlow = false; gs.VectorMaskEnable = false; /* Wa_1606682166 */ - gs.SamplerCount = GFX_VER == 11 ? 
0 : get_sampler_count(gs_bin); + gs.SamplerCount = get_sampler_count(gs_bin); gs.BindingTableEntryCount = gs_bin->bind_map.surface_count; gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles; gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; @@ -1655,14 +1550,9 @@ emit_3dstate_gs(struct anv_graphics_pipeline *pipeline) gs_prog_data->base.cull_distance_mask; #endif -#if GFX_VERx10 >= 125 - gs.ScratchSpaceBuffer = - get_scratch_surf(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin); -#else gs.PerThreadScratchSpace = get_scratch_space(gs_bin); gs.ScratchSpaceBasePointer = get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin); -#endif } } @@ -1810,20 +1700,6 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline, ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; ps._32PixelDispatchEnable = wm_prog_data->dispatch_32; - /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable: - * - * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32 - * Dispatch must not be enabled for PER_PIXEL dispatch mode." - * - * Since 16x MSAA is first introduced on SKL, we don't need to apply - * the workaround on any older hardware. - */ - if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch && - ms != NULL && ms->rasterization_samples == 16) { - assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable); - ps._32PixelDispatchEnable = false; - } - ps.KernelStartPointer0 = fs_bin->kernel.offset + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0); ps.KernelStartPointer1 = fs_bin->kernel.offset + @@ -1835,7 +1711,7 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline, ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask; /* Wa_1606682166 */ - ps.SamplerCount = GFX_VER == 11 ? 
0 : get_sampler_count(fs_bin); + ps.SamplerCount = get_sampler_count(fs_bin); ps.BindingTableEntryCount = fs_bin->bind_map.surface_count; ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 || wm_prog_data->base.ubo_ranges[0].length; @@ -1868,14 +1744,9 @@ emit_3dstate_ps(struct anv_graphics_pipeline *pipeline, ps.DispatchGRFStartRegisterForConstantSetupData2 = brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2); -#if GFX_VERx10 >= 125 - ps.ScratchSpaceBuffer = - get_scratch_surf(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin); -#else ps.PerThreadScratchSpace = get_scratch_space(fs_bin); ps.ScratchSpaceBasePointer = get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin); -#endif } } @@ -1911,35 +1782,7 @@ emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline, rp->stencil_self_dependency || wm_prog_data->uses_kill; -#if GFX_VER >= 9 - ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil; - ps.PixelShaderPullsBary = wm_prog_data->pulls_bary; - - ps.InputCoverageMaskState = ICMS_NONE; - assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */ - if (!wm_prog_data->uses_sample_mask) - ps.InputCoverageMaskState = ICMS_NONE; - else if (wm_prog_data->per_coarse_pixel_dispatch) - ps.InputCoverageMaskState = ICMS_NORMAL; - else if (wm_prog_data->post_depth_coverage) - ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE; - else - ps.InputCoverageMaskState = ICMS_NORMAL; -#else ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; -#endif - -#if GFX_VER >= 11 - ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients = - wm_prog_data->uses_depth_w_coefficients; - ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch; -#endif -#if GFX_VERx10 >= 125 - /* TODO: We should only require this when the last geometry shader uses - * a fragment shading rate that is not constant. 
- */ - ps.EnablePSDependencyOnCPsizeChange = wm_prog_data->per_coarse_pixel_dispatch; -#endif } } #endif @@ -1986,37 +1829,6 @@ compute_kill_pixel(struct anv_graphics_pipeline *pipeline, (ms && ms->alpha_to_coverage_enable); } -#if GFX_VER == 12 -static void -emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline, - const struct vk_render_pass_state *rp) -{ - const int replication_count = - anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots; - - assert(replication_count >= 1); - if (replication_count == 1) { - anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); - return; - } - - uint32_t view_mask = rp->view_mask; - assert(replication_count == util_bitcount(view_mask)); - assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION); - - anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { - pr.ReplicaMask = (1 << replication_count) - 1; - pr.ReplicationCount = replication_count - 1; - - int i = 0; - u_foreach_bit(view_index, rp->view_mask) { - pr.RTAIOffset[i] = view_index; - i++; - } - } -} -#endif - void genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline, const struct vk_graphics_pipeline_state *state) @@ -2033,10 +1845,6 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline, emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs); -#if GFX_VER == 12 - emit_3dstate_primitive_replication(pipeline, state->rp); -#endif - #if 0 /* From gfx7_vs_state.c */ @@ -2075,28 +1883,6 @@ genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline, #endif } -#if GFX_VERx10 >= 125 - -void -genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) -{ - struct anv_device *device = pipeline->base.device; - const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); - anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0); - - const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs; - const 
struct intel_device_info *devinfo = device->info; - - anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) { - cfe.MaximumNumberofThreads = - devinfo->max_cs_threads * devinfo->subslice_total; - cfe.ScratchSpaceBuffer = - get_scratch_surf(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin); - } -} - -#else /* #if GFX_VERx10 >= 125 */ - void genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) { @@ -2123,12 +1909,8 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) vfe.MaximumNumberofThreads = devinfo->max_cs_threads * devinfo->subslice_total - 1; vfe.NumberofURBEntries = GFX_VER <= 7 ? 0 : 2; -#if GFX_VER < 11 vfe.ResetGatewayTimer = true; -#endif -#if GFX_VER <= 8 vfe.BypassGatewayControl = true; -#endif vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2; vfe.CURBEAllocationSize = vfe_curbe_allocation; @@ -2163,7 +1945,7 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size), /* Wa_1606682166 */ - .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin), + .SamplerCount = get_sampler_count(cs_bin), /* We add 1 because the CS indirect parameters buffer isn't accounted * for in bind_map.surface_count. */ @@ -2180,18 +1962,6 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) .CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs, #endif -#if GFX_VER >= 12 - /* TODO: Check if we are missing workarounds and enable mid-thread - * preemption. - * - * We still have issues with mid-thread preemption (it was already - * disabled by the kernel on gfx11, due to missing workarounds). It's - * possible that we are just missing some workarounds, and could enable - * it later, but for now let's disable it to fix a GPU in compute in Car - * Chase (and possibly more). 
- */ - .ThreadPreemptionDisable = true, -#endif .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, }; @@ -2199,5 +1969,3 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) pipeline->interface_descriptor_data, &desc); } - -#endif /* #if GFX_VERx10 >= 125 */
