Basically, this extension allows applications to use custom sample locations. This only implements the bare minimum; it doesn't support variable sample locations during a subpass.
Most of the dEQP-VK.pipeline.multisample.sample_locations_ext.* CTS now pass. Only enabled on VI+ because it's untested on older chips. Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com> --- src/amd/vulkan/radv_cmd_buffer.c | 177 +++++++++++++++++++++++++++++- src/amd/vulkan/radv_device.c | 27 +++++ src/amd/vulkan/radv_extensions.py | 1 + src/amd/vulkan/radv_pipeline.c | 30 +++++ src/amd/vulkan/radv_private.h | 26 +++-- 5 files changed, 253 insertions(+), 8 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index b4aea5bc898..c4bebeda0ce 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -105,6 +105,7 @@ radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, dest->viewport.count = src->viewport.count; dest->scissor.count = src->scissor.count; dest->discard_rectangle.count = src->discard_rectangle.count; + dest->sample_location.count = src->sample_location.count; if (copy_mask & RADV_DYNAMIC_VIEWPORT) { if (memcmp(&dest->viewport.viewports, &src->viewport.viewports, @@ -192,6 +193,22 @@ radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, } } + if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) { + if (dest->sample_location.per_pixel != src->sample_location.per_pixel || + dest->sample_location.grid_size.width != src->sample_location.grid_size.width || + dest->sample_location.grid_size.height != src->sample_location.grid_size.height || + memcmp(&dest->sample_location.locations, + &src->sample_location.locations, + src->sample_location.count * sizeof(VkSampleLocationEXT))) { + dest->sample_location.per_pixel = src->sample_location.per_pixel; + dest->sample_location.grid_size = src->sample_location.grid_size; + typed_memcpy(dest->sample_location.locations, + src->sample_location.locations, + src->sample_location.count); + dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS; + } + } + cmd_buffer->state.dirty |= dest_mask; } @@ -634,6 +651,135 @@ radv_emit_descriptor_pointers(struct 
radv_cmd_buffer *cmd_buffer, } } +/** + * Convert the user sample locations to hardware sample locations (the values + * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*). + */ +static void +radv_convert_user_sample_locs(struct radv_sample_locations_state *state, + uint32_t x, uint32_t y, VkOffset2D *sample_locs) +{ + uint32_t x_offset = x % state->grid_size.width; + uint32_t y_offset = y % state->grid_size.height; + uint32_t num_samples = (uint32_t)state->per_pixel; + VkSampleLocationEXT *user_locs; + uint32_t pixel_offset; + + pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples; + + assert(pixel_offset <= MAX_SAMPLE_LOCATIONS); + user_locs = &state->locations[pixel_offset]; + + for (uint32_t i = 0; i < num_samples; i++) { + float shifted_pos_x = user_locs[i].x - 0.5; + float shifted_pos_y = user_locs[i].y - 0.5; + + int32_t scaled_pos_x = floor(shifted_pos_x * 16); + int32_t scaled_pos_y = floor(shifted_pos_y * 16); + + sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7); + sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7); + } +} + +/** + * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample + * locations. + */ +static void +radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs, + uint32_t *sample_locs_pixel) +{ + for (uint32_t i = 0; i < num_samples; ++i) { + uint32_t sample_reg_idx = i / 4; + uint32_t sample_loc_idx = i % 4; + int32_t pos_x = sample_locs[i].x; + int32_t pos_y = sample_locs[i].y; + + uint32_t shift_x = 8 * sample_loc_idx; + uint32_t shift_y = shift_x + 4; + + sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x; + sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y; + } +} + +/** + * Emit the sample locations that are specified with VK_EXT_sample_locations. 
+ */ +static void +radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + struct radv_multisample_state *ms = &pipeline->graphics.ms; + struct radv_sample_locations_state *sample_location = + &cmd_buffer->state.dynamic.sample_location; + uint32_t num_samples = (uint32_t)sample_location->per_pixel; + struct radeon_cmdbuf *cs = cmd_buffer->cs; + uint32_t sample_locs_pixel[4][2] = {}; + VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */ + uint32_t max_sample_dist = 0; + + /* Convert the user sample locations to hardware sample locations. */ + radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]); + radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]); + radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]); + radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]); + + /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */ + for (uint32_t i = 0; i < 4; i++) { + radv_compute_sample_locs_pixel(num_samples, sample_locs[i], + sample_locs_pixel[i]); + } + + /* Emit the specified user sample locations. 
*/ + switch (num_samples) { + case 2: + case 4: + radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]); + radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]); + radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]); + radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]); + break; + case 8: + radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]); + radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]); + radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]); + radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]); + radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]); + radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]); + radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]); + radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]); + break; + default: + unreachable("Unsupported number of samples!"); + } + + /* Compute the maximum sample distance from the specified locations. */ + for (uint32_t i = 0; i < num_samples; i++) { + VkOffset2D offset = sample_locs[0][i]; + max_sample_dist = MAX2(max_sample_dist, + MAX2(abs(offset.x), abs(offset.y))); + } + + /* Emit the maximum sample distance if different. 
*/ + if (G_028BE0_MAX_SAMPLE_DIST(ms->pa_sc_aa_config) != max_sample_dist) { + uint32_t pa_sc_aa_config = ms->pa_sc_aa_config; + + pa_sc_aa_config &= C_028BE0_MAX_SAMPLE_DIST; + pa_sc_aa_config |= S_028BE0_MAX_SAMPLE_DIST(max_sample_dist); + + radeon_set_context_reg_seq(cs, R_028BE0_PA_SC_AA_CONFIG, 1); + radeon_emit(cs, pa_sc_aa_config); + + /* GFX9: Flush DFSM when the AA mode changes. */ + if (cmd_buffer->device->dfsm_allowed) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0)); + } + } +} + static void radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline) @@ -645,7 +791,14 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions) cmd_buffer->sample_positions_needed = true; - if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples) + /* Emit the multisample state (including sample locations) only if: + * - it's the first bound pipeline in the command buffer + * - the number of samples of this pipeline is different + * - the previous pipeline used custom sample locations + */ + if (old_pipeline && + num_samples == old_pipeline->graphics.ms.num_samples && + !old_pipeline->dynamic_state.sample_location.count) return; radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2); @@ -1711,6 +1864,9 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer) if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE) radv_emit_discard_rectangle(cmd_buffer); + if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) + radv_emit_sample_locations(cmd_buffer); + cmd_buffer->state.dirty &= ~states; } @@ -3050,6 +3206,25 @@ void radv_CmdSetDiscardRectangleEXT( state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE; } +void radv_CmdSetSampleLocationsEXT( + VkCommandBuffer commandBuffer, + const VkSampleLocationsInfoEXT* 
pSampleLocationsInfo) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_cmd_state *state = &cmd_buffer->state; + + assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS); + + state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel; + state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize; + state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount; + typed_memcpy(&state->dynamic.sample_location.locations[0], + pSampleLocationsInfo->pSampleLocations, + pSampleLocationsInfo->sampleLocationsCount); + + state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS; +} + void radv_CmdExecuteCommands( VkCommandBuffer commandBuffer, uint32_t commandBufferCount, diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index cb51ee44e58..6b19641f66d 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1238,6 +1238,19 @@ void radv_GetPhysicalDeviceProperties2( properties->transformFeedbackDraw = true; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT: { + VkPhysicalDeviceSampleLocationsPropertiesEXT *properties = + (VkPhysicalDeviceSampleLocationsPropertiesEXT *)ext; + properties->sampleLocationSampleCounts = VK_SAMPLE_COUNT_2_BIT | + VK_SAMPLE_COUNT_4_BIT | + VK_SAMPLE_COUNT_8_BIT; + properties->maxSampleLocationGridSize = (VkExtent2D){ 2 , 2 }; + properties->sampleLocationCoordinateRange[0] = 0.0f; + properties->sampleLocationCoordinateRange[1] = 1.0f; + properties->sampleLocationSubPixelBits = 4; + properties->variableSampleLocations = VK_FALSE; + break; + } default: break; } @@ -5111,3 +5124,17 @@ VkResult radv_GetCalibratedTimestampsEXT( return VK_SUCCESS; } + +void radv_GetPhysicalDeviceMultisamplePropertiesEXT( + VkPhysicalDevice physicalDevice, + VkSampleCountFlagBits samples, + VkMultisamplePropertiesEXT* pMultisampleProperties) +{ + if (samples & 
(VK_SAMPLE_COUNT_2_BIT | + VK_SAMPLE_COUNT_4_BIT | + VK_SAMPLE_COUNT_8_BIT)) { + pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){ 2, 2 }; + } else { + pMultisampleProperties->maxSampleLocationGridSize = (VkExtent2D){ 0, 0 }; + } +} diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index d14169144f7..19b24ac4157 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -106,6 +106,7 @@ EXTENSIONS = [ Extension('VK_EXT_external_memory_host', 1, 'device->rad_info.has_userptr'), Extension('VK_EXT_global_priority', 1, 'device->rad_info.has_ctx_priority'), Extension('VK_EXT_pci_bus_info', 1, True), + Extension('VK_EXT_sample_locations', 1, 'device->rad_info.chip_class >= VI'), Extension('VK_EXT_sampler_filter_minmax', 1, 'device->rad_info.chip_class >= CIK'), Extension('VK_EXT_scalar_block_layout', 1, 'device->rad_info.chip_class >= CIK'), Extension('VK_EXT_shader_viewport_index_layer', 1, True), diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 33076cc2bd2..266fdb43367 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -1276,6 +1276,8 @@ static unsigned radv_dynamic_state_mask(VkDynamicState state) return RADV_DYNAMIC_STENCIL_REFERENCE; case VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT: return RADV_DYNAMIC_DISCARD_RECTANGLE; + case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT: + return RADV_DYNAMIC_SAMPLE_LOCATIONS; default: unreachable("Unhandled dynamic state"); } @@ -1306,6 +1308,11 @@ static uint32_t radv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreat if (!vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT)) states &= ~RADV_DYNAMIC_DISCARD_RECTANGLE; + if (!pCreateInfo->pMultisampleState || + !vk_find_struct_const(pCreateInfo->pMultisampleState->pNext, + PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT)) + states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS; + /* TODO: blend constants & line 
width. */ return states; @@ -1442,6 +1449,29 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline, discard_rectangle_info->discardRectangleCount); } + if (states & RADV_DYNAMIC_SAMPLE_LOCATIONS) { + const VkPipelineSampleLocationsStateCreateInfoEXT *sample_location_info = + vk_find_struct_const(pCreateInfo->pMultisampleState->pNext, + PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT); + /* If sampleLocationsEnable is VK_FALSE, the default sample + * locations are used and the values specified in + * sampleLocationsInfo are ignored. + */ + if (sample_location_info->sampleLocationsEnable) { + const VkSampleLocationsInfoEXT *pSampleLocationsInfo = + &sample_location_info->sampleLocationsInfo; + + assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS); + + dynamic->sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel; + dynamic->sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize; + dynamic->sample_location.count = pSampleLocationsInfo->sampleLocationsCount; + typed_memcpy(&dynamic->sample_location.locations[0], + pSampleLocationsInfo->pSampleLocations, + pSampleLocationsInfo->sampleLocationsCount); + } + } + pipeline->dynamic_state.mask = states; } diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index e3dd301ee8f..4139a2911aa 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -90,6 +90,7 @@ typedef uint32_t xcb_window_t; #define MAX_VIEWPORTS 16 #define MAX_SCISSORS 16 #define MAX_DISCARD_RECTANGLES 4 +#define MAX_SAMPLE_LOCATIONS 32 #define MAX_PUSH_CONSTANTS_SIZE 128 #define MAX_PUSH_DESCRIPTORS 32 #define MAX_DYNAMIC_UNIFORM_BUFFERS 16 @@ -829,7 +830,8 @@ enum radv_dynamic_state_bits { RADV_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, RADV_DYNAMIC_STENCIL_REFERENCE = 1 << 8, RADV_DYNAMIC_DISCARD_RECTANGLE = 1 << 9, - RADV_DYNAMIC_ALL = (1 << 10) - 1, + RADV_DYNAMIC_SAMPLE_LOCATIONS = 1 << 10, + RADV_DYNAMIC_ALL = (1 << 11) - 1, }; enum 
radv_cmd_dirty_bits { @@ -845,12 +847,13 @@ enum radv_cmd_dirty_bits { RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 8, RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE = 1 << 9, - RADV_CMD_DIRTY_DYNAMIC_ALL = (1 << 10) - 1, - RADV_CMD_DIRTY_PIPELINE = 1 << 10, - RADV_CMD_DIRTY_INDEX_BUFFER = 1 << 11, - RADV_CMD_DIRTY_FRAMEBUFFER = 1 << 12, - RADV_CMD_DIRTY_VERTEX_BUFFER = 1 << 13, - RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1 << 14, + RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS = 1 << 10, + RADV_CMD_DIRTY_DYNAMIC_ALL = (1 << 11) - 1, + RADV_CMD_DIRTY_PIPELINE = 1 << 11, + RADV_CMD_DIRTY_INDEX_BUFFER = 1 << 12, + RADV_CMD_DIRTY_FRAMEBUFFER = 1 << 13, + RADV_CMD_DIRTY_VERTEX_BUFFER = 1 << 14, + RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1 << 15, }; enum radv_cmd_flush_bits { @@ -927,6 +930,13 @@ struct radv_discard_rectangle_state { VkRect2D rectangles[MAX_DISCARD_RECTANGLES]; }; +struct radv_sample_locations_state { + VkSampleCountFlagBits per_pixel; + VkExtent2D grid_size; + uint32_t count; + VkSampleLocationEXT locations[MAX_SAMPLE_LOCATIONS]; +}; + struct radv_dynamic_state { /** * Bitmask of (1 << VK_DYNAMIC_STATE_*). @@ -969,6 +979,8 @@ struct radv_dynamic_state { } stencil_reference; struct radv_discard_rectangle_state discard_rectangle; + + struct radv_sample_locations_state sample_location; }; extern const struct radv_dynamic_state default_dynamic_state; -- 2.19.2 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev