Module: Mesa
Branch: main
Commit: 0dad88b4694cf82664f633187442ec65c14f3812
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=0dad88b4694cf82664f633187442ec65c14f3812

Author: Bas Nieuwenhuizen <[email protected]>
Date:   Tue May 18 13:25:00 2021 +0200

radv: Implement device-side BVH building.

Same naive algorithm as the host build.

Reviewed-by: Samuel Pitoiset <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11078>

---
 src/amd/vulkan/radv_acceleration_structure.c | 830 +++++++++++++++++++++++++++
 src/amd/vulkan/radv_meta.c                   |   7 +
 src/amd/vulkan/radv_meta.h                   |   3 +
 src/amd/vulkan/radv_private.h                |   7 +
 4 files changed, 847 insertions(+)

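At a high level the device build mirrors the host build: a first compute pass writes one leaf node per primitive and records its node id in scratch, then an internal-node pass repeatedly folds groups of four node ids into 128-byte box nodes until a single root remains. A pseudocode-level sketch of that flow; the dispatch_* and flush helpers are placeholders for the dispatches issued below, not RADV functions:

    /* Sketch of the two-pass structure; helper names are illustrative. */
    static void
    build_bvh_sketch(uint32_t leaf_count)
    {
       dispatch_leaf_pass(leaf_count);           /* one thread per primitive */

       uint32_t node_count = leaf_count;
       bool first = true;
       while (first || node_count > 1) {         /* always run once so the header is written */
          uint32_t dst_count = MAX2(1, DIV_ROUND_UP(node_count, 4));
          flush_shader_writes();                 /* CS partial flush between levels */
          dispatch_internal_pass(node_count, dst_count);
          node_count = dst_count;
          first = false;
       }
    }
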
diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c
index 98ef979378d..15d1a62f085 100644
--- a/src/amd/vulkan/radv_acceleration_structure.c
+++ b/src/amd/vulkan/radv_acceleration_structure.c
@@ -23,6 +23,9 @@
 #include "radv_private.h"
 #include "util/half_float.h"
 
+#include "nir_builder.h"
+#include "radv_cs.h"
+#include "radv_meta.h"
 
 struct radv_accel_struct_header {
    uint32_t root_node_offset;
@@ -589,3 +592,830 @@ radv_BuildAccelerationStructuresKHR(
    }
    return result;
 }
+
+static nir_ssa_def *
+get_indices(nir_builder *b, nir_ssa_def *addr, nir_ssa_def *type, nir_ssa_def *id)
+{
+   const struct glsl_type *uvec3_type = glsl_vector_type(GLSL_TYPE_UINT, 3);
+   nir_variable *result =
+      nir_variable_create(b->shader, nir_var_shader_temp, uvec3_type, "indices");
+
+   nir_push_if(b, nir_ult(b, type, nir_imm_int(b, 2)));
+   nir_push_if(b, nir_ieq(b, type, nir_imm_int(b, VK_INDEX_TYPE_UINT16)));
+   {
+      nir_ssa_def *index_id = nir_umul24(b, id, nir_imm_int(b, 6));
+      nir_ssa_def *indices[3];
+      for (unsigned i = 0; i < 3; ++i) {
+         indices[i] = nir_build_load_global(
+            b, 1, 16, nir_iadd(b, addr, nir_u2u64(b, nir_iadd(b, index_id, nir_imm_int(b, 2 * i)))),
+            .align_mul = 2, .align_offset = 0);
+      }
+      nir_store_var(b, result, nir_u2u32(b, nir_vec(b, indices, 3)), 7);
+   }
+   nir_push_else(b, NULL);
+   {
+      nir_ssa_def *index_id = nir_umul24(b, id, nir_imm_int(b, 12));
+      nir_ssa_def *indices = nir_build_load_global(
+         b, 3, 32, nir_iadd(b, addr, nir_u2u64(b, index_id)), .align_mul = 4, .align_offset = 0);
+      nir_store_var(b, result, indices, 7);
+   }
+   nir_pop_if(b, NULL);
+   nir_push_else(b, NULL);
+   {
+      nir_ssa_def *index_id = nir_umul24(b, id, nir_imm_int(b, 3));
+      nir_ssa_def *indices[] = {
+         index_id,
+         nir_iadd(b, index_id, nir_imm_int(b, 1)),
+         nir_iadd(b, index_id, nir_imm_int(b, 2)),
+      };
+
+      nir_push_if(b, nir_ieq(b, type, nir_imm_int(b, VK_INDEX_TYPE_NONE_KHR)));
+      {
+         nir_store_var(b, result, nir_vec(b, indices, 3), 7);
+      }
+      nir_push_else(b, NULL);
+      {
+         for (unsigned i = 0; i < 3; ++i) {
+            indices[i] = nir_build_load_global(b, 1, 8, nir_iadd(b, addr, nir_u2u64(b, indices[i])),
+                                               .align_mul = 1, .align_offset = 0);
+         }
+         nir_store_var(b, result, nir_u2u32(b, nir_vec(b, indices, 3)), 7);
+      }
+      nir_pop_if(b, NULL);
+   }
+   nir_pop_if(b, NULL);
+   return nir_load_var(b, result);
+}
+
+static void
+get_vertices(nir_builder *b, nir_ssa_def *addresses, nir_ssa_def *format, nir_ssa_def *positions[3])
+{
+   const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
+   nir_variable *results[3] = {
+      nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex0"),
+      nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex1"),
+      nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex2")};
+
+   VkFormat formats[] = {
+      VK_FORMAT_R32G32B32_SFLOAT,
+      VK_FORMAT_R32G32B32A32_SFLOAT,
+      VK_FORMAT_R16G16B16_SFLOAT,
+      VK_FORMAT_R16G16B16A16_SFLOAT,
+   };
+
+   for (unsigned f = 0; f < ARRAY_SIZE(formats); ++f) {
+      if (f + 1 < ARRAY_SIZE(formats))
+         nir_push_if(b, nir_ieq(b, format, nir_imm_int(b, formats[f])));
+
+      for (unsigned i = 0; i < 3; ++i) {
+         switch (formats[f]) {
+         case VK_FORMAT_R32G32B32_SFLOAT:
+         case VK_FORMAT_R32G32B32A32_SFLOAT:
+            nir_store_var(b, results[i],
+                          nir_build_load_global(b, 3, 32, nir_channel(b, addresses, i),
+                                                .align_mul = 4, .align_offset = 0),
+                          7);
+            break;
+         case VK_FORMAT_R16G16B16_SFLOAT:
+         case VK_FORMAT_R16G16B16A16_SFLOAT: {
+            nir_ssa_def *values[3];
+            nir_ssa_def *addr = nir_channel(b, addresses, i);
+            for (unsigned j = 0; j < 3; ++j)
+               values[j] =
+                  nir_build_load_global(b, 1, 16, nir_iadd(b, addr, nir_imm_int64(b, j * 2)),
+                                        .align_mul = 2, .align_offset = 0);
+            nir_store_var(b, results[i], nir_f2f32(b, nir_vec(b, values, 3)), 7);
+            break;
+         }
+         default:
+            unreachable("Unhandled format");
+         }
+      }
+      if (f + 1 < ARRAY_SIZE(formats))
+         nir_push_else(b, NULL);
+   }
+   for (unsigned f = 1; f < ARRAY_SIZE(formats); ++f) {
+      nir_pop_if(b, NULL);
+   }
+
+   for (unsigned i = 0; i < 3; ++i)
+      positions[i] = nir_load_var(b, results[i]);
+}
+
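get_indices() above covers three cases: 16-bit and 32-bit index buffers, and builds without an index buffer, where the primitive id itself addresses the vertex data (the remaining branch reads 8-bit indices). A scalar C sketch of what the emitted NIR does per triangle, for reference only:

    /* Host-side sketch of the per-thread index fetch in get_indices(). */
    static void
    fetch_triangle_indices(const void *data, VkIndexType type, uint32_t prim,
                           uint32_t out[3])
    {
       for (unsigned i = 0; i < 3; ++i) {
          uint32_t v = prim * 3 + i;
          switch (type) {
          case VK_INDEX_TYPE_UINT16:   out[i] = ((const uint16_t *)data)[v]; break;
          case VK_INDEX_TYPE_UINT32:   out[i] = ((const uint32_t *)data)[v]; break;
          case VK_INDEX_TYPE_NONE_KHR: out[i] = v;                           break;
          default:                     out[i] = ((const uint8_t *)data)[v];  break;
          }
       }
    }
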
+struct build_primitive_constants {
+   uint64_t node_dst_addr;
+   uint64_t scratch_addr;
+   uint32_t dst_offset;
+   uint32_t dst_scratch_offset;
+   uint32_t geometry_type;
+   uint32_t geometry_id;
+
+   union {
+      struct {
+         uint64_t vertex_addr;
+         uint64_t index_addr;
+         uint64_t transform_addr;
+         uint32_t vertex_stride;
+         uint32_t vertex_format;
+         uint32_t index_format;
+      };
+      struct {
+         uint64_t instance_data;
+      };
+      struct {
+         uint64_t aabb_addr;
+         uint32_t aabb_stride;
+      };
+   };
+};
+
+struct build_internal_constants {
+   uint64_t node_dst_addr;
+   uint64_t scratch_addr;
+   uint32_t dst_offset;
+   uint32_t dst_scratch_offset;
+   uint32_t src_scratch_offset;
+   uint32_t fill_header;
+};
+
+/* This inverts a 3x3 matrix using cofactors, as in e.g.
+ * https://www.mathsisfun.com/algebra/matrix-inverse-minors-cofactors-adjugate.html */
+static void
+nir_invert_3x3(nir_builder *b, nir_ssa_def *in[3][3], nir_ssa_def *out[3][3])
+{
+   nir_ssa_def *cofactors[3][3];
+   for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 3; ++j) {
+         cofactors[i][j] =
+            nir_fsub(b, nir_fmul(b, in[(i + 1) % 3][(j + 1) % 3], in[(i + 2) % 3][(j + 2) % 3]),
+                     nir_fmul(b, in[(i + 1) % 3][(j + 2) % 3], in[(i + 2) % 3][(j + 1) % 3]));
+      }
+   }
+
+   nir_ssa_def *det = NULL;
+   for (unsigned i = 0; i < 3; ++i) {
+      nir_ssa_def *det_part = nir_fmul(b, in[0][i], cofactors[0][i]);
+      det = det ? nir_fadd(b, det, det_part) : det_part;
+   }
+
+   nir_ssa_def *det_inv = nir_frcp(b, det);
+   for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 3; ++j) {
+         out[i][j] = nir_fmul(b, cofactors[j][i], det_inv);
+      }
+   }
+}
+
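For reference, the same computation in scalar C: the cyclic (i+1, i+2) indexing folds the cofactor signs in, the determinant is expanded along row 0, and the transposed cofactor matrix (the adjugate) divided by the determinant gives the inverse:

    /* Scalar reference for the cofactor inversion above. */
    static void
    invert_3x3(const float in[3][3], float out[3][3])
    {
       float cof[3][3];
       for (unsigned i = 0; i < 3; ++i)
          for (unsigned j = 0; j < 3; ++j)
             cof[i][j] = in[(i + 1) % 3][(j + 1) % 3] * in[(i + 2) % 3][(j + 2) % 3] -
                         in[(i + 1) % 3][(j + 2) % 3] * in[(i + 2) % 3][(j + 1) % 3];

       /* Determinant expanded along row 0. */
       float det = in[0][0] * cof[0][0] + in[0][1] * cof[0][1] + in[0][2] * cof[0][2];

       for (unsigned i = 0; i < 3; ++i)
          for (unsigned j = 0; j < 3; ++j)
             out[i][j] = cof[j][i] / det; /* adjugate = transposed cofactors */
    }
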
+static nir_shader *
+build_leaf_shader(struct radv_device *dev)
+{
+   const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
+   nir_builder b =
+      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "accel_build_leaf_shader");
+
+   b.shader->info.workgroup_size[0] = 64;
+   b.shader->info.workgroup_size[1] = 1;
+   b.shader->info.workgroup_size[2] = 1;
+
+   nir_ssa_def *pconst0 =
+      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);
+   nir_ssa_def *pconst1 =
+      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 16, .range = 16);
+   nir_ssa_def *pconst2 =
+      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 32, .range = 16);
+   nir_ssa_def *pconst3 =
+      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 48, .range = 16);
+   nir_ssa_def *pconst4 =
+      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 64, .range = 4);
+
+   nir_ssa_def *geom_type = nir_channel(&b, pconst1, 2);
+   nir_ssa_def *node_dst_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 3));
+   nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 12));
+   nir_ssa_def *node_dst_offset = nir_channel(&b, pconst1, 0);
+   nir_ssa_def *scratch_offset = nir_channel(&b, pconst1, 1);
+   nir_ssa_def *geometry_id = nir_channel(&b, pconst1, 3);
+
+   nir_ssa_def *global_id =
+      nir_iadd(&b,
+               nir_umul24(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 1),
+                          nir_imm_int(&b, b.shader->info.workgroup_size[0])),
+               nir_channels(&b, nir_load_local_invocation_id(&b), 1));
+   scratch_addr = nir_iadd(
+      &b, scratch_addr,
+      nir_u2u64(&b, nir_iadd(&b, scratch_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 4)))));
+
+   nir_push_if(&b, nir_ieq(&b, geom_type, nir_imm_int(&b, VK_GEOMETRY_TYPE_TRIANGLES_KHR)));
+   { /* Triangles */
+      nir_ssa_def *vertex_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3));
+      nir_ssa_def *index_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 12));
+      nir_ssa_def *transform_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst3, 3));
+      nir_ssa_def *vertex_stride = nir_channel(&b, pconst3, 2);
+      nir_ssa_def *vertex_format = nir_channel(&b, pconst3, 3);
+      nir_ssa_def *index_format = nir_channel(&b, pconst4, 0);
+      unsigned repl_swizzle[4] = {0, 0, 0, 0};
+
+      nir_ssa_def *node_offset =
+         nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 64)));
+      nir_ssa_def *triangle_node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));
+
+      nir_ssa_def *indices = get_indices(&b, index_addr, index_format, global_id);
+      nir_ssa_def *vertex_addresses = nir_iadd(
+         &b, nir_u2u64(&b, nir_imul(&b, indices, nir_swizzle(&b, vertex_stride, repl_swizzle, 3))),
+         nir_swizzle(&b, vertex_addr, repl_swizzle, 3));
+      nir_ssa_def *positions[3];
+      get_vertices(&b, vertex_addresses, vertex_format, positions);
+
+      nir_ssa_def *node_data[16];
+      memset(node_data, 0, sizeof(node_data));
+
+      nir_variable *transform[] = {
+         nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "transform0"),
+         nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "transform1"),
+         nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "transform2"),
+      };
+      nir_store_var(&b, transform[0], nir_imm_vec4(&b, 1.0, 0.0, 0.0, 0.0), 0xf);
+      nir_store_var(&b, transform[1], nir_imm_vec4(&b, 0.0, 1.0, 0.0, 0.0), 0xf);
+      nir_store_var(&b, transform[2], nir_imm_vec4(&b, 0.0, 0.0, 1.0, 0.0), 0xf);
+
+      nir_push_if(&b, nir_ine(&b, transform_addr, nir_imm_int64(&b, 0)));
+      nir_store_var(
+         &b, transform[0],
+         nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 0)),
+                               .align_mul = 4, .align_offset = 0),
+         0xf);
+      nir_store_var(
+         &b, transform[1],
+         nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 16)),
+                               .align_mul = 4, .align_offset = 0),
+         0xf);
+      nir_store_var(
+         &b, transform[2],
+         nir_build_load_global(&b, 4, 32, nir_iadd(&b, transform_addr, nir_imm_int64(&b, 32)),
+                               .align_mul = 4, .align_offset = 0),
+         0xf);
+      nir_pop_if(&b, NULL);
+
+      for (unsigned i = 0; i < 3; ++i)
+         for (unsigned j = 0; j < 3; ++j)
+            node_data[i * 3 + j] = nir_fdph(&b, positions[i], nir_load_var(&b, transform[j]));
+
+      node_data[12] = global_id;
+      node_data[13] = geometry_id;
+      node_data[15] = nir_imm_int(&b, 9);
+      for (unsigned i = 0; i < ARRAY_SIZE(node_data); ++i)
+         if (!node_data[i])
+            node_data[i] = nir_imm_int(&b, 0);
+
+      for (unsigned i = 0; i < 4; ++i) {
+         nir_build_store_global(&b, nir_vec(&b, node_data + i * 4, 4),
+                                nir_iadd(&b, triangle_node_dst_addr, nir_imm_int64(&b, i * 16)),
+                                .write_mask = 15, .align_mul = 16, .align_offset = 0);
+      }
+
+      nir_ssa_def *node_id = nir_ushr(&b, node_offset, nir_imm_int(&b, 3));
+      nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,
+                             .align_offset = 0);
+   }
+   nir_push_else(&b, NULL);
+   nir_push_if(&b, nir_ieq(&b, geom_type, nir_imm_int(&b, VK_GEOMETRY_TYPE_AABBS_KHR)));
+   { /* AABBs */
+      nir_ssa_def *aabb_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3));
+      nir_ssa_def *aabb_stride = nir_channel(&b, pconst2, 2);
+
+      nir_ssa_def *node_offset =
+         nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 64)));
+      nir_ssa_def *aabb_node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));
+      nir_ssa_def *node_id =
+         nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 7));
+      nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,
+                             .align_offset = 0);
+
+      aabb_addr = nir_iadd(&b, aabb_addr, nir_u2u64(&b, nir_imul(&b, aabb_stride, global_id)));
+
+      nir_ssa_def *min_bound =
+         nir_build_load_global(&b, 3, 32, nir_iadd(&b, aabb_addr, nir_imm_int64(&b, 0)),
+                               .align_mul = 4, .align_offset = 0);
+      nir_ssa_def *max_bound =
+         nir_build_load_global(&b, 3, 32, nir_iadd(&b, aabb_addr, nir_imm_int64(&b, 12)),
+                               .align_mul = 4, .align_offset = 0);
+
+      nir_ssa_def *values[] = {nir_channel(&b, min_bound, 0),
+                               nir_channel(&b, min_bound, 1),
+                               nir_channel(&b, min_bound, 2),
+                               nir_channel(&b, max_bound, 0),
+                               nir_channel(&b, max_bound, 1),
+                               nir_channel(&b, max_bound, 2),
+                               global_id,
+                               geometry_id};
+
+      nir_build_store_global(&b, nir_vec(&b, values + 0, 4),
+                             nir_iadd(&b, aabb_node_dst_addr, nir_imm_int64(&b, 0)),
+                             .write_mask = 15, .align_mul = 16, .align_offset = 0);
+      nir_build_store_global(&b, nir_vec(&b, values + 4, 4),
+                             nir_iadd(&b, aabb_node_dst_addr, nir_imm_int64(&b, 16)),
+                             .write_mask = 15, .align_mul = 16, .align_offset = 0);
+   }
+   nir_push_else(&b, NULL);
+   { /* Instances */
+
+      nir_ssa_def *instance_addr =
+         nir_iadd(&b, nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3)),
+                  nir_u2u64(&b, nir_imul(&b, global_id, nir_imm_int(&b, 64))));
+      nir_ssa_def *inst_transform[] = {
+         nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 0)),
+                               .align_mul = 4, .align_offset = 0),
+         nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 16)),
+                               .align_mul = 4, .align_offset = 0),
+         nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 32)),
+                               .align_mul = 4, .align_offset = 0)};
+      nir_ssa_def *inst3 =
+         nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 48)),
+                               .align_mul = 4, .align_offset = 0);
+
+      nir_ssa_def *node_offset =
+         nir_iadd(&b, node_dst_offset, nir_umul24(&b, global_id, nir_imm_int(&b, 128)));
+      node_dst_addr = nir_iadd(&b, node_dst_addr, nir_u2u64(&b, node_offset));
+      nir_ssa_def *node_id =
+         nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 6));
+      nir_build_store_global(&b, node_id, scratch_addr, .write_mask = 1, .align_mul = 4,
+                             .align_offset = 0);
+
+      nir_variable *bounds[2] = {
+         nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),
+         nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),
+      };
+
+      nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
+      nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
+
+      nir_ssa_def *header_addr = nir_pack_64_2x32(&b, nir_channels(&b, inst3, 12));
+      nir_push_if(&b, nir_ine(&b, header_addr, nir_imm_int64(&b, 0)));
+      nir_ssa_def *header_root_offset =
+         nir_build_load_global(&b, 1, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 0)),
+                               .align_mul = 4, .align_offset = 0);
+      nir_ssa_def *header_min =
+         nir_build_load_global(&b, 3, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 8)),
+                               .align_mul = 4, .align_offset = 0);
+      nir_ssa_def *header_max =
+         nir_build_load_global(&b, 3, 32, nir_iadd(&b, header_addr, nir_imm_int64(&b, 20)),
+                               .align_mul = 4, .align_offset = 0);
+
+      nir_ssa_def *bound_defs[2][3];
+      for (unsigned i = 0; i < 3; ++i) {
+         bound_defs[0][i] = bound_defs[1][i] = nir_channel(&b, inst_transform[i], 3);
+
+         nir_ssa_def *mul_a = nir_fmul(&b, nir_channels(&b, inst_transform[i], 7), header_min);
+         nir_ssa_def *mul_b = nir_fmul(&b, nir_channels(&b, inst_transform[i], 7), header_max);
+         nir_ssa_def *mi = nir_fmin(&b, mul_a, mul_b);
+         nir_ssa_def *ma = nir_fmax(&b, mul_a, mul_b);
+         for (unsigned j = 0; j < 3; ++j) {
+            bound_defs[0][i] = nir_fadd(&b, bound_defs[0][i], nir_channel(&b, mi, j));
+            bound_defs[1][i] = nir_fadd(&b, bound_defs[1][i], nir_channel(&b, ma, j));
+         }
+      }
+
+      nir_store_var(&b, bounds[0], nir_vec(&b, bound_defs[0], 3), 7);
+      nir_store_var(&b, bounds[1], nir_vec(&b, bound_defs[1], 3), 7);
+
+      nir_ssa_def *m_in[3][3], *m_out[3][3], *m_vec[3][4];
+      for (unsigned i = 0; i < 3; ++i)
+         for (unsigned j = 0; j < 3; ++j)
+            m_in[i][j] = nir_channel(&b, inst_transform[i], j);
+      nir_invert_3x3(&b, m_in, m_out);
+      for (unsigned i = 0; i < 3; ++i) {
+         for (unsigned j = 0; j < 3; ++j)
+            m_vec[i][j] = m_out[i][j];
+         m_vec[i][3] = nir_channel(&b, inst_transform[i], 3);
+      }
+
+      for (unsigned i = 0; i < 3; ++i) {
+         nir_build_store_global(&b, nir_vec(&b, m_vec[i], 4),
+                                nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 16 + 16 * i)),
+                                .write_mask = 0xf, .align_mul = 4, .align_offset = 0);
+      }
+
+      nir_ssa_def *out0[4] = {
+         nir_ior(&b, nir_channel(&b, nir_unpack_64_2x32(&b, header_addr), 0), header_root_offset),
+         nir_channel(&b, nir_unpack_64_2x32(&b, header_addr), 1), nir_channel(&b, inst3, 0),
+         nir_channel(&b, inst3, 1)};
+      nir_build_store_global(&b, nir_vec(&b, out0, 4),
+                             nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 0)), .write_mask = 0xf,
+                             .align_mul = 4, .align_offset = 0);
+      nir_build_store_global(&b, global_id, nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 88)),
+                             .write_mask = 0x1, .align_mul = 4, .align_offset = 0);
+      nir_pop_if(&b, NULL);
+      nir_build_store_global(&b, nir_load_var(&b, bounds[0]),
+                             nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 64)), .write_mask = 0x7,
+                             .align_mul = 4, .align_offset = 0);
+      nir_build_store_global(&b, nir_load_var(&b, bounds[1]),
+                             nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 76)), .write_mask = 0x7,
+                             .align_mul = 4, .align_offset = 0);
+   }
+   nir_pop_if(&b, NULL);
+   nir_pop_if(&b, NULL);
+
+   return b.shader;
+}
+
+static void
+determine_bounds(nir_builder *b, nir_ssa_def *node_addr, nir_ssa_def *node_id,
+                 nir_variable *bounds_vars[2])
+{
+   nir_ssa_def *node_type = nir_iand(b, node_id, nir_imm_int(b, 7));
+   node_addr = nir_iadd(
+      b, node_addr,
+      nir_u2u64(b, nir_ishl(b, nir_iand(b, node_id, nir_imm_int(b, ~7u)), nir_imm_int(b, 3))));
+
+   nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 0)));
+   {
+      nir_ssa_def *positions[3];
+      for (unsigned i = 0; i < 3; ++i)
+         positions[i] =
+            nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, i * 12)),
+                                  .align_mul = 4, .align_offset = 0);
+      nir_ssa_def *bounds[] = {positions[0], positions[0]};
+      for (unsigned i = 1; i < 3; ++i) {
+         bounds[0] = nir_fmin(b, bounds[0], positions[i]);
+         bounds[1] = nir_fmax(b, bounds[1], positions[i]);
+      }
+      nir_store_var(b, bounds_vars[0], bounds[0], 7);
+      nir_store_var(b, bounds_vars[1], bounds[1], 7);
+   }
+   nir_push_else(b, NULL);
+   nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 5)));
+   {
+      nir_ssa_def *input_bounds[4][2];
+      for (unsigned i = 0; i < 4; ++i)
+         for (unsigned j = 0; j < 2; ++j)
+            input_bounds[i][j] = nir_build_load_global(
+               b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 16 + i * 24 + j * 12)),
+               .align_mul = 4, .align_offset = 0);
+      nir_ssa_def *bounds[] = {input_bounds[0][0], input_bounds[0][1]};
+      for (unsigned i = 1; i < 4; ++i) {
+         bounds[0] = nir_fmin(b, bounds[0], input_bounds[i][0]);
+         bounds[1] = nir_fmax(b, bounds[1], input_bounds[i][1]);
+      }
+
+      nir_store_var(b, bounds_vars[0], bounds[0], 7);
+      nir_store_var(b, bounds_vars[1], bounds[1], 7);
+   }
+   nir_push_else(b, NULL);
+   nir_push_if(b, nir_ieq(b, node_type, nir_imm_int(b, 6)));
+   { /* Instances */
+      nir_ssa_def *bounds[2];
+      for (unsigned i = 0; i < 2; ++i)
+         bounds[i] =
+            nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 64 + i * 12)),
+                                  .align_mul = 4, .align_offset = 0);
+      nir_store_var(b, bounds_vars[0], bounds[0], 7);
+      nir_store_var(b, bounds_vars[1], bounds[1], 7);
+   }
+   nir_push_else(b, NULL);
+   { /* AABBs */
+      nir_ssa_def *bounds[2];
+      for (unsigned i = 0; i < 2; ++i)
+         bounds[i] =
+            nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, i * 12)),
+                                  .align_mul = 4, .align_offset = 0);
+      nir_store_var(b, bounds_vars[0], bounds[0], 7);
+      nir_store_var(b, bounds_vars[1], bounds[1], 7);
+   }
+   nir_pop_if(b, NULL);
+   nir_pop_if(b, NULL);
+   nir_pop_if(b, NULL);
+}
+
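determine_bounds() decodes the node id scheme used throughout this patch: the low three bits carry the node type (0 = triangle, 5 = internal box, 6 = instance, 7 = AABB, matching the +5/+6/+7 additions in the shaders above), and the remaining bits give the node's offset in 64-byte units. A decoding sketch, derived from the shader code rather than any published layout:

    static inline uint32_t node_type(uint32_t node_id)
    {
       return node_id & 7u;
    }

    static inline uint64_t node_byte_offset(uint32_t node_id)
    {
       /* (node_id & ~7) << 3 == (node_id >> 3) * 64 */
       return (uint64_t)(node_id & ~7u) << 3;
    }
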
+static nir_shader *
+build_internal_shader(struct radv_device *dev)
+{
+   const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
+   nir_builder b =
+      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "accel_build_internal_shader");
+
+   b.shader->info.workgroup_size[0] = 64;
+   b.shader->info.workgroup_size[1] = 1;
+   b.shader->info.workgroup_size[2] = 1;
+
+   /*
+    * push constants:
+    *   i32 x 2: node dst address
+    *   i32 x 2: scratch address
+    *   i32: dst offset
+    *   i32: dst scratch offset
+    *   i32: src scratch offset
+    *   i32: src_node_count | (fill_header << 31)
+    */
+   nir_ssa_def *pconst0 =
+      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16);
+   nir_ssa_def *pconst1 =
+      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 16, .range = 16);
+
+   nir_ssa_def *node_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 3));
+   nir_ssa_def *scratch_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 12));
+   nir_ssa_def *node_dst_offset = nir_channel(&b, pconst1, 0);
+   nir_ssa_def *dst_scratch_offset = nir_channel(&b, pconst1, 1);
+   nir_ssa_def *src_scratch_offset = nir_channel(&b, pconst1, 2);
+   nir_ssa_def *src_node_count =
+      nir_iand(&b, nir_channel(&b, pconst1, 3), nir_imm_int(&b, 0x7FFFFFFFU));
+   nir_ssa_def *fill_header =
+      nir_ine(&b, nir_iand(&b, nir_channel(&b, pconst1, 3), nir_imm_int(&b, 0x80000000U)),
+              nir_imm_int(&b, 0));
+
+   nir_ssa_def *global_id =
+      nir_iadd(&b,
+               nir_umul24(&b, nir_channels(&b, nir_load_workgroup_id(&b, 32), 1),
+                          nir_imm_int(&b, b.shader->info.workgroup_size[0])),
+               nir_channels(&b, nir_load_local_invocation_id(&b), 1));
+   nir_ssa_def *src_idx = nir_imul(&b, global_id, nir_imm_int(&b, 4));
+   nir_ssa_def *src_count = nir_umin(&b, nir_imm_int(&b, 4), nir_isub(&b, src_node_count, src_idx));
+
+   nir_ssa_def *node_offset =
+      nir_iadd(&b, node_dst_offset, nir_ishl(&b, global_id, nir_imm_int(&b, 7)));
+   nir_ssa_def *node_dst_addr = nir_iadd(&b, node_addr, nir_u2u64(&b, node_offset));
+   nir_ssa_def *src_nodes = nir_build_load_global(
+      &b, 4, 32,
+      nir_iadd(&b, scratch_addr,
+               nir_u2u64(&b, nir_iadd(&b, src_scratch_offset,
+                                      nir_ishl(&b, global_id, nir_imm_int(&b, 4))))),
+      .align_mul = 4, .align_offset = 0);
+
+   nir_build_store_global(&b, src_nodes, nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 0)),
+                          .write_mask = 0xf, .align_mul = 4, .align_offset = 0);
+
+   nir_ssa_def *total_bounds[2] = {
+      nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),
+      nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7),
+   };
+
+   for (unsigned i = 0; i < 4; ++i) {
+      nir_variable *bounds[2] = {
+         nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "min_bound"),
+         nir_variable_create(b.shader, nir_var_shader_temp, vec3_type, "max_bound"),
+      };
+      nir_store_var(&b, bounds[0], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
+      nir_store_var(&b, bounds[1], nir_channels(&b, nir_imm_vec4(&b, NAN, NAN, NAN, NAN), 7), 7);
+
+      nir_push_if(&b, nir_ilt(&b, nir_imm_int(&b, i), src_count));
+      determine_bounds(&b, node_addr, nir_channel(&b, src_nodes, i), bounds);
+      nir_pop_if(&b, NULL);
+      nir_build_store_global(&b, nir_load_var(&b, bounds[0]),
+                             nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 16 + 24 * i)),
+                             .write_mask = 0x7, .align_mul = 4, .align_offset = 0);
+      nir_build_store_global(&b, nir_load_var(&b, bounds[1]),
+                             nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 28 + 24 * i)),
+                             .write_mask = 0x7, .align_mul = 4, .align_offset = 0);
+      total_bounds[0] = nir_fmin(&b, total_bounds[0], nir_load_var(&b, bounds[0]));
+      total_bounds[1] = nir_fmax(&b, total_bounds[1], nir_load_var(&b, bounds[1]));
+   }
+
+   nir_ssa_def *node_id =
+      nir_iadd(&b, nir_ushr(&b, node_offset, nir_imm_int(&b, 3)), nir_imm_int(&b, 5));
+   nir_ssa_def *dst_scratch_addr = nir_iadd(
+      &b, scratch_addr,
+      nir_u2u64(&b, nir_iadd(&b, dst_scratch_offset, nir_ishl(&b, global_id, nir_imm_int(&b, 2)))));
+   nir_build_store_global(&b, node_id, dst_scratch_addr, .write_mask = 1, .align_mul = 4,
+                          .align_offset = 0);
+
+   nir_push_if(&b, fill_header);
+   nir_build_store_global(&b, node_id, node_addr, .write_mask = 1, .align_mul = 4,
+                          .align_offset = 0);
+   nir_build_store_global(&b, total_bounds[0], nir_iadd(&b, node_addr, nir_imm_int64(&b, 8)),
+                          .write_mask = 7, .align_mul = 4, .align_offset = 0);
+   nir_build_store_global(&b, total_bounds[1], nir_iadd(&b, node_addr, nir_imm_int64(&b, 20)),
+                          .write_mask = 7, .align_mul = 4, .align_offset = 0);
+   nir_pop_if(&b, NULL);
+   return b.shader;
+}
+
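Each thread of this shader consumes up to four source node ids from scratch (src_idx = 4 * global_id) and emits one 128-byte box node; the last push-constant dword packs the source node count with the fill_header flag, exactly as unpacked at the top of the shader. A host-side helper matching that packing might look like this (the helper name is illustrative, not RADV code):

    /* Assumed packing for build_internal_constants::fill_header. */
    static inline uint32_t
    pack_src_count_and_header(uint32_t src_node_count, bool fill_header)
    {
       assert(src_node_count < 0x80000000u);
       return src_node_count | (fill_header ? 0x80000000u : 0u);
    }
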
+void
+radv_device_finish_accel_struct_build_state(struct radv_device *device)
+{
+   struct radv_meta_state *state = &device->meta_state;
+   radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.internal_pipeline,
+                        &state->alloc);
+   radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.leaf_pipeline,
+                        &state->alloc);
+   radv_DestroyPipelineLayout(radv_device_to_handle(device),
+                              state->accel_struct_build.internal_p_layout, &state->alloc);
+   radv_DestroyPipelineLayout(radv_device_to_handle(device),
+                              state->accel_struct_build.leaf_p_layout, &state->alloc);
+}
+
+VkResult
+radv_device_init_accel_struct_build_state(struct radv_device *device)
+{
+   VkResult result;
+   nir_shader *leaf_cs = build_leaf_shader(device);
+   nir_shader *internal_cs = build_internal_shader(device);
+
+   const VkPipelineLayoutCreateInfo leaf_pl_create_info = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+      .setLayoutCount = 0,
+      .pushConstantRangeCount = 1,
+      .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,
+                                                    sizeof(struct build_primitive_constants)},
+   };
+
+   result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info,
+                                      &device->meta_state.alloc,
+                                      &device->meta_state.accel_struct_build.leaf_p_layout);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   VkPipelineShaderStageCreateInfo leaf_shader_stage = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+      .module = vk_shader_module_handle_from_nir(leaf_cs),
+      .pName = "main",
+      .pSpecializationInfo = NULL,
+   };
+
+   VkComputePipelineCreateInfo leaf_pipeline_info = {
+      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+      .stage = leaf_shader_stage,
+      .flags = 0,
+      .layout = device->meta_state.accel_struct_build.leaf_p_layout,
+   };
+
+   result = radv_CreateComputePipelines(
+      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
+      &leaf_pipeline_info, NULL, &device->meta_state.accel_struct_build.leaf_pipeline);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   const VkPipelineLayoutCreateInfo internal_pl_create_info = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+      .setLayoutCount = 0,
+      .pushConstantRangeCount = 1,
+      .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0,
+                                                    sizeof(struct build_internal_constants)},
+   };
+
+   result = radv_CreatePipelineLayout(radv_device_to_handle(device), &internal_pl_create_info,
+                                      &device->meta_state.alloc,
+                                      &device->meta_state.accel_struct_build.internal_p_layout);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   VkPipelineShaderStageCreateInfo internal_shader_stage = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+      .module = vk_shader_module_handle_from_nir(internal_cs),
+      .pName = "main",
+      .pSpecializationInfo = NULL,
+   };
+
+   VkComputePipelineCreateInfo internal_pipeline_info = {
+      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+      .stage = internal_shader_stage,
+      .flags = 0,
+      .layout = device->meta_state.accel_struct_build.internal_p_layout,
+   };
+
+   result = radv_CreateComputePipelines(
+      radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
+      &internal_pipeline_info, NULL, &device->meta_state.accel_struct_build.internal_pipeline);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   return VK_SUCCESS;
+
+fail:
+   radv_device_finish_accel_struct_build_state(device);
+   ralloc_free(internal_cs);
+   ralloc_free(leaf_cs);
+   return result;
+}
+
+struct bvh_state {
+   uint32_t node_offset;
+   uint32_t node_count;
+   uint32_t scratch_offset;
+};
+
+void
+radv_CmdBuildAccelerationStructuresKHR(
+   VkCommandBuffer commandBuffer, uint32_t infoCount,
+   const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
+   const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos)
+{
+   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_meta_saved_state saved_state;
+
+   radv_meta_save(
+      &saved_state, cmd_buffer,
+      RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);
+   struct bvh_state *bvh_states = calloc(infoCount, sizeof(struct bvh_state));
+
+   radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
+                        cmd_buffer->device->meta_state.accel_struct_build.leaf_pipeline);
+
+   for (uint32_t i = 0; i < infoCount; ++i) {
+      RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct,
+                       pInfos[i].dstAccelerationStructure);
+
+      struct build_primitive_constants prim_consts = {
+         .node_dst_addr = radv_accel_struct_get_va(accel_struct),
+         .scratch_addr = pInfos[i].scratchData.deviceAddress,
+         .dst_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64) + 128,
+         .dst_scratch_offset = 0,
+      };
+
+      for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) {
+         const VkAccelerationStructureGeometryKHR *geom =
+            pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j];
+
+         prim_consts.geometry_type = geom->geometryType;
+         prim_consts.geometry_id = j | (geom->flags << 28);
+         unsigned prim_size;
+         switch (geom->geometryType) {
+         case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
+            prim_consts.vertex_addr =
+               geom->geometry.triangles.vertexData.deviceAddress +
+               ppBuildRangeInfos[i][j].firstVertex * geom->geometry.triangles.vertexStride +
+               (geom->geometry.triangles.indexType != VK_INDEX_TYPE_NONE_KHR
+                   ? ppBuildRangeInfos[i][j].primitiveOffset
+                   : 0);
+            prim_consts.index_addr = geom->geometry.triangles.indexData.deviceAddress +
+                                     ppBuildRangeInfos[i][j].primitiveOffset;
+            prim_consts.transform_addr = geom->geometry.triangles.transformData.deviceAddress +
+                                         ppBuildRangeInfos[i][j].transformOffset;
+            prim_consts.vertex_stride = geom->geometry.triangles.vertexStride;
+            prim_consts.vertex_format = geom->geometry.triangles.vertexFormat;
+            prim_consts.index_format = geom->geometry.triangles.indexType;
+            prim_size = 64;
+            break;
+         case VK_GEOMETRY_TYPE_AABBS_KHR:
+            prim_consts.aabb_addr =
+               geom->geometry.aabbs.data.deviceAddress + ppBuildRangeInfos[i][j].primitiveOffset;
+            prim_consts.aabb_stride = geom->geometry.aabbs.stride;
+            prim_size = 64;
+            break;
+         case VK_GEOMETRY_TYPE_INSTANCES_KHR:
+            prim_consts.instance_data = geom->geometry.instances.data.deviceAddress;
+            prim_size = 128;
+            break;
+         default:
+            unreachable("Unknown geometryType");
+         }
+
+         radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+                               cmd_buffer->device->meta_state.accel_struct_build.leaf_p_layout,
+                               VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(prim_consts), &prim_consts);
+         radv_unaligned_dispatch(cmd_buffer, ppBuildRangeInfos[i][j].primitiveCount, 1, 1);
+         prim_consts.dst_offset += prim_size * ppBuildRangeInfos[i][j].primitiveCount;
+         prim_consts.dst_scratch_offset += 4 * ppBuildRangeInfos[i][j].primitiveCount;
+      }
+      bvh_states[i].node_offset = prim_consts.dst_offset;
+      bvh_states[i].node_count = prim_consts.dst_scratch_offset / 4;
+   }
+
+   radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
+                        cmd_buffer->device->meta_state.accel_struct_build.internal_pipeline);
+   bool progress = true;
+   for (unsigned iter = 0; progress; ++iter) {
+      progress = false;
+      for (uint32_t i = 0; i < infoCount; ++i) {
+         RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct,
+                          pInfos[i].dstAccelerationStructure);
+
+         if (iter && bvh_states[i].node_count == 1)
+            continue;
+
+         if (!progress) {
+            cmd_buffer->state.flush_bits |=
+               RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
+               radv_src_access_flush(cmd_buffer, VK_ACCESS_SHADER_WRITE_BIT, NULL) |
+               radv_dst_access_flush(cmd_buffer,
+                                     VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, NULL);
+         }
+         progress = true;
+         uint32_t dst_node_count = MAX2(1, DIV_ROUND_UP(bvh_states[i].node_count, 4));
+         bool final_iter = dst_node_count == 1;
+         uint32_t src_scratch_offset = bvh_states[i].scratch_offset;
+         uint32_t dst_scratch_offset = src_scratch_offset ? 0 : bvh_states[i].node_count * 4;
+         uint32_t dst_node_offset = bvh_states[i].node_offset;
+         if (final_iter)
+            dst_node_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64);
+
+         const struct build_internal_constants consts = {
+            .node_dst_addr = radv_accel_struct_get_va(accel_struct),
+            .scratch_addr = pInfos[i].scratchData.deviceAddress,
+            .dst_offset = dst_node_offset,
+            .dst_scratch_offset = dst_scratch_offset,
+            .src_scratch_offset = src_scratch_offset,
+            .fill_header = bvh_states[i].node_count | (final_iter ? 0x80000000U : 0),
+         };
+
+         radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
+                               cmd_buffer->device->meta_state.accel_struct_build.internal_p_layout,
+                               VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts);
+         radv_unaligned_dispatch(cmd_buffer, dst_node_count, 1, 1);
+         bvh_states[i].node_offset += dst_node_count * 128;
+         bvh_states[i].node_count = dst_node_count;
+         bvh_states[i].scratch_offset = dst_scratch_offset;
+      }
+   }
+   free(bvh_states);
+   radv_meta_restore(&saved_state, cmd_buffer);
+}
\ No newline at end of file
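The internal-node loop above ping-pongs node ids between two scratch regions (src_scratch_offset and dst_scratch_offset swap roles each level) and runs until a single node remains, so a build over n leaves takes ceil(log4(n)) internal dispatches, with a minimum of one so the header always gets written. A small model of that pass count, for reference only:

    /* Number of internal-node dispatches for a given leaf count. */
    static unsigned
    internal_pass_count(uint32_t leaf_count)
    {
       unsigned passes = 0;
       uint32_t n = leaf_count ? leaf_count : 1;
       do {
          n = (n + 3) / 4; /* each box node takes up to four children */
          ++passes;
       } while (n > 1);
       return passes;
    }
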
diff --git a/src/amd/vulkan/radv_meta.c b/src/amd/vulkan/radv_meta.c
index 8416e9adfda..b54e4bd183c 100644
--- a/src/amd/vulkan/radv_meta.c
+++ b/src/amd/vulkan/radv_meta.c
@@ -474,8 +474,14 @@ radv_device_init_meta(struct radv_device *device)
    if (result != VK_SUCCESS)
       goto fail_fmask_expand;
 
+   result = radv_device_init_accel_struct_build_state(device);
+   if (result != VK_SUCCESS)
+      goto fail_accel_struct_build;
+
    return VK_SUCCESS;
 
+fail_accel_struct_build:
+   radv_device_finish_meta_fmask_expand_state(device);
 fail_fmask_expand:
    radv_device_finish_meta_resolve_fragment_state(device);
 fail_resolve_fragment:
@@ -507,6 +513,7 @@ fail_clear:
 void
 radv_device_finish_meta(struct radv_device *device)
 {
+   radv_device_finish_accel_struct_build_state(device);
    radv_device_finish_meta_clear_state(device);
    radv_device_finish_meta_resolve_state(device);
    radv_device_finish_meta_blit_state(device);
diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h
index c827baf969c..d04da9d3a2b 100644
--- a/src/amd/vulkan/radv_meta.h
+++ b/src/amd/vulkan/radv_meta.h
@@ -133,6 +133,9 @@ void radv_device_finish_meta_dcc_retile_state(struct radv_device *device);
 
 void radv_device_finish_meta_copy_vrs_htile_state(struct radv_device *device);
 
+VkResult radv_device_init_accel_struct_build_state(struct radv_device *device);
+void radv_device_finish_accel_struct_build_state(struct radv_device *device);
+
 void radv_meta_save(struct radv_meta_saved_state *saved_state, struct radv_cmd_buffer *cmd_buffer,
                     uint32_t flags);
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index b5abef0827d..bf99e928b9f 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -661,6 +661,13 @@ struct radv_meta_state {
       VkPipelineLayout p_layout;
      VkPipeline pipeline;
    } dcc_retile;
+
+   struct {
+      VkPipelineLayout leaf_p_layout;
+      VkPipeline leaf_pipeline;
+      VkPipelineLayout internal_p_layout;
+      VkPipeline internal_pipeline;
+   } accel_struct_build;
 };
 
 /* queue types */
_______________________________________________
mesa-commit mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-commit
