Commit: be785ef4f78e3dd7d51963f249177b1c2a8dc5d6 Author: Brecht Van Lommel Date: Thu May 6 20:06:57 2021 +0200 Branches: cycles-x https://developer.blender.org/rBbe785ef4f78e3dd7d51963f249177b1c2a8dc5d6
Cycles X: refactoring of kernel globals * Declare kernel globals and associated macros in one place * Add IntegratorStateGPU for CPU host to access * Reduce code duplication between CUDA and OptiX * Make shader sort key part of integrator state template * Remove unused CPU kernel globals * Prepare for copying/moving states * Rename integrator_path_state.h to integrator_state_flow.h, to avoid confusion with kernel_path_state.h. Parts of these changes were implemented by Sergey for D11172. Differential Revision: https://developer.blender.org/D11185 =================================================================== M intern/cycles/device/cpu/kernel_thread_globals.cpp M intern/cycles/device/optix/device_impl.cpp M intern/cycles/device/optix/device_impl.h M intern/cycles/device/optix/queue.cpp M intern/cycles/integrator/path_trace_work_gpu.cpp M intern/cycles/integrator/path_trace_work_gpu.h M intern/cycles/kernel/CMakeLists.txt M intern/cycles/kernel/device/cpu/compat.h M intern/cycles/kernel/device/cpu/globals.h M intern/cycles/kernel/device/cpu/kernel_arch_impl.h M intern/cycles/kernel/device/cuda/compat.h M intern/cycles/kernel/device/cuda/globals.h M intern/cycles/kernel/device/cuda/kernel.cu M intern/cycles/kernel/device/cuda/parallel_sorted_index.h M intern/cycles/kernel/device/optix/compat.h M intern/cycles/kernel/device/optix/globals.h M intern/cycles/kernel/device/optix/kernel.cu D intern/cycles/kernel/integrator/integrator_path_state.h M intern/cycles/kernel/integrator/integrator_state.h A intern/cycles/kernel/integrator/integrator_state_flow.h M intern/cycles/kernel/integrator/integrator_state_template.h =================================================================== diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp index eff15af5ff1..f0089e34a7a 100644 --- a/intern/cycles/device/cpu/kernel_thread_globals.cpp +++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp @@ -23,22 +23,6 @@ 
CCL_NAMESPACE_BEGIN -/* TODO(sergey): Consider making more available function. Maybe `util_memory.h`? */ -static void safe_free(void *mem) -{ - if (mem == nullptr) { - return; - } - free(mem); -} - -/* Get number of elements in a bound array. */ -/* TODO(sergey): Make this function more re-usable. */ -template<class T, int N> constexpr inline int ARRAY_SIZE(T (&/*array*/)[N]) -{ - return N; -} - CPUKernelThreadGlobals::CPUKernelThreadGlobals() { reset_runtime_memory(); @@ -50,7 +34,6 @@ CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globa { reset_runtime_memory(); - decoupled_volume_steps_index = 0; coverage_asset = nullptr; coverage_object = nullptr; coverage_material = nullptr; @@ -70,12 +53,6 @@ CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) n CPUKernelThreadGlobals::~CPUKernelThreadGlobals() { - safe_free(transparent_shadow_intersections); - - const int decoupled_count = ARRAY_SIZE(decoupled_volume_steps); - for (int i = 0; i < decoupled_count; ++i) { - safe_free(decoupled_volume_steps[i]); - } #ifdef WITH_OSL OSLShader::thread_free(this); #endif @@ -96,13 +73,9 @@ CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals void CPUKernelThreadGlobals::reset_runtime_memory() { - transparent_shadow_intersections = nullptr; - #ifdef WITH_OSL osl = nullptr; #endif - - memset(decoupled_volume_steps, 0, sizeof(decoupled_volume_steps)); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index c0ee4f143b8..5843d88ce88 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -97,7 +97,7 @@ OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profile # endif /* Fix weird compiler bug that assigns wrong size. 
*/ - launch_params.data_elements = sizeof(KernelParams); + launch_params.data_elements = sizeof(KernelParamsOptiX); /* Allocate launch parameter buffer memory on device. */ launch_params.alloc_to_device(1); @@ -1387,20 +1387,17 @@ void OptiXDevice::const_copy_to(const char *name, void *host, size_t size) KernelData *const data = (KernelData *)host; *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle; - update_launch_params(offsetof(KernelParams, data), host, size); + update_launch_params(offsetof(KernelParamsOptiX, data), host, size); return; } /* Update data storage pointers in launch parameters. */ # define KERNEL_TEX(data_type, tex_name) \ if (strcmp(name, #tex_name) == 0) { \ - update_launch_params(offsetof(KernelParams, tex_name), host, size); \ + update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \ return; \ } KERNEL_TEX(IntegratorState, __integrator_state) - KERNEL_TEX(IntegratorQueueCounter *, __integrator_queue_counter) - KERNEL_TEX(int *, __integrator_sort_key) - KERNEL_TEX(int *, __integrator_sort_key_counter) # include "kernel/kernel_textures.h" # undef KERNEL_TEX } diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h index ba606075c79..a4b75a16354 100644 --- a/intern/cycles/device/optix/device_impl.h +++ b/intern/cycles/device/optix/device_impl.h @@ -26,7 +26,7 @@ CCL_NAMESPACE_BEGIN class BVHOptiX; -struct KernelParams; +struct KernelParamsOptiX; /* List of OptiX program groups. 
*/ enum { @@ -64,7 +64,7 @@ class OptiXDevice : public CUDADevice { bool motion_blur = false; device_vector<SbtRecord> sbt_data; - device_only_memory<KernelParams> launch_params; + device_only_memory<KernelParamsOptiX> launch_params; OptixTraversableHandle tlas_handle = 0; class Denoiser { diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp index 211df631bcb..59203dedb35 100644 --- a/intern/cycles/device/optix/queue.cpp +++ b/intern/cycles/device/optix/queue.cpp @@ -69,16 +69,17 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a cuda_device_assert( cuda_device_, - cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, path_index_array), + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array), args[0], // &d_path_index sizeof(device_ptr), cuda_stream_)); if (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - cuda_device_assert(cuda_device_, - cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, render_buffer), - args[1], // &d_render_buffer - sizeof(device_ptr), - cuda_stream_)); + cuda_device_assert( + cuda_device_, + cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer), + args[1], // &d_render_buffer + sizeof(device_ptr), + cuda_stream_)); } cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index 8991bfa0c63..0289c634ae6 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -38,7 +38,6 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device, queue_(device->gpu_queue_create()), render_buffers_(buffers), integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE), - integrator_sort_key_(device, "integrator_sort_key", MEM_READ_WRITE), integrator_sort_key_counter_(device, "integrator_sort_key_counter", MEM_READ_WRITE), 
queued_paths_(device, "queued_paths", MEM_READ_WRITE), num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE), @@ -47,36 +46,42 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device, max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorState))), max_active_path_index_(0) { + memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_)); work_tile_scheduler_.set_max_num_path_states(max_num_paths_); } -void PathTraceWorkGPU::alloc_integrator_state() +void PathTraceWorkGPU::alloc_integrator_soa() { /* IntegrateState allocated as structure of arrays. * * Allocate a device only memory buffer before for each struct member, and then * write the pointers into a struct that resides in constant memory. * - * This assumes the device side struct memory contains consecutive pointers for - * each struct member, with the same 64-bit size as device_ptr. - * - * TODO: store float3 in separate XYZ arrays. */ + * TODO: store float3 in separate XYZ arrays. + * TODO: skip zeroing most arrays and leave uninitialized. */ + if (!integrator_state_soa_.empty()) { return; } - vector<device_ptr> device_struct; - #define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) { -#define KERNEL_STRUCT_MEMBER(type, name) \ +#define KERNEL_STRUCT_MEMBER(parent_struct, type, name) \ + { \ + device_only_memory<type> *array = new device_only_memory<type>(device_, \ + "integrator_state_" #name); \ + array->alloc_to_device(max_num_paths_); \ + array->zero_to_device(); \ + integrator_state_soa_.emplace_back(array); \ + integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \ + } +#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name) \ { \ device_only_memory<type> *array = new device_only_memory<type>(device_, \ "integrator_state_" #name); \ array->alloc_to_device(max_num_paths_); \ - /* TODO: skip for most arrays. 
*/ \ array->zero_to_device(); \ - device_struct.push_back(array->device_pointer); \ integrator_state_soa_.emplace_back(array); \ + integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \ } #define KERNEL_STRUCT_END(name) \ break; \ @@ -89,12 +94,9 @@ void PathTraceWorkGPU::alloc_integrator_state() #include "kernel/integrator/integrator_state_template.h" #undef KERNEL_STRUCT_BEGIN #undef KERNEL_STRUCT_MEMBER +#undef KERNEL_STRUCT_ARRAY_MEMBER #undef KERNEL_STRUCT_END #undef KERNEL_STRUCT_END_ARRAY - - /* Copy to device side struct in constant memory. */ - device_->const_copy_to( - "__integrator_state", device_struct.data(), device_struct.size() * sizeof(device_ptr)); } void PathTraceWorkGPU::alloc_integrator_queue() @@ -103,11 +105,8 @@ void PathTraceWorkGPU::alloc_integrator_queue() integrator_queue_counter_.alloc(1); integrator_queue_counter_.zero_to_device(); integrator_queue_counter_.copy_from_device(); - - /* Copy to device side pointer in constant memory. */ - device_->const_copy_to("__integrator_queue_counter", - &integrator_queue_counter_.device_pointer, - sizeof(device_ptr)); + integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *) + integrator_queue_counter_.device_pointer; } /* Allocate data for active path index arrays. */ @@ -126,21 +125,11 @@ void PathTraceWorkGPU::alloc_integrator_queue() void PathTraceWorkGPU::alloc_integrator_sorting() { /* Allocate arrays for shader sorting. */ - if (integrator_sort_key_counter_.size() == 0) { - integrator_sort_key_.alloc(max_num_paths_); - /* TODO: this could be skip if we had a function to just allocate on device. */ - integrator_sort_key_.zero_to_device(); - device_->const_copy_to( - "__integrator_sort_key", &integrator_ @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list [email protected] https://lists.blender.org/mailman/listinfo/bf-blender-cvs
