Commit: 6bf4115c13962c99d1cdc97f2be92c4922f3fd33 Author: Hristo Gueorguiev Date: Wed May 3 15:30:45 2017 +0200 Branches: master https://developer.blender.org/rB6bf4115c13962c99d1cdc97f2be92c4922f3fd33
Cycles: Split kernel - sort shaders Reduce thread divergence in kernel_shader_eval. Rays are sorted in blocks of 2048 according to shader->id. On R9 290 Classroom is ~30% faster, and Pabellon Barcelone is ~8% faster. No sorting for CUDA split kernel. Reviewers: sergey, maiself Reviewed By: maiself Differential Revision: https://developer.blender.org/D2598 =================================================================== M intern/cycles/device/device_split_kernel.cpp M intern/cycles/device/device_split_kernel.h M intern/cycles/kernel/CMakeLists.txt M intern/cycles/kernel/kernel_types.h M intern/cycles/kernel/kernels/cpu/kernel_cpu.h M intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h M intern/cycles/kernel/kernels/cuda/kernel_split.cu M intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl A intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl A intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl M intern/cycles/kernel/kernels/opencl/kernel_split.cl M intern/cycles/kernel/split/kernel_shader_eval.h A intern/cycles/kernel/split/kernel_shader_setup.h A intern/cycles/kernel/split/kernel_shader_sort.h M intern/cycles/kernel/split/kernel_split_data_types.h =================================================================== diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index bb3089c5418..9118793aad6 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -39,6 +39,8 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device) kernel_do_volume = NULL; kernel_queue_enqueue = NULL; kernel_indirect_background = NULL; + kernel_shader_setup = NULL; + kernel_shader_sort = NULL; kernel_shader_eval = NULL; kernel_holdout_emission_blurring_pathtermination_ao = NULL; kernel_subsurface_scatter = NULL; @@ -64,6 +66,8 @@ DeviceSplitKernel::~DeviceSplitKernel() delete kernel_do_volume; delete kernel_queue_enqueue; delete kernel_indirect_background; + delete kernel_shader_setup; + delete kernel_shader_sort; delete kernel_shader_eval; delete kernel_holdout_emission_blurring_pathtermination_ao; delete kernel_subsurface_scatter; @@ -89,6 +93,8 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe LOAD_KERNEL(do_volume); LOAD_KERNEL(queue_enqueue); LOAD_KERNEL(indirect_background); + LOAD_KERNEL(shader_setup); + LOAD_KERNEL(shader_sort); LOAD_KERNEL(shader_eval); LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); LOAD_KERNEL(subsurface_scatter); @@ -241,6 +247,8 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 55548122c0c..58c2fdbb077 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -61,6 +61,8 @@ private: SplitKernelFunction *kernel_do_volume; SplitKernelFunction *kernel_queue_enqueue; SplitKernelFunction *kernel_indirect_background; + SplitKernelFunction *kernel_shader_setup; + SplitKernelFunction *kernel_shader_sort; SplitKernelFunction *kernel_shader_eval; SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; SplitKernelFunction *kernel_subsurface_scatter; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index a92e8bc4aee..9bb0455b9d5 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -21,6 +21,8 @@ set(SRC kernels/opencl/kernel_lamp_emission.cl kernels/opencl/kernel_do_volume.cl kernels/opencl/kernel_indirect_background.cl + kernels/opencl/kernel_shader_setup.cl + kernels/opencl/kernel_shader_sort.cl kernels/opencl/kernel_shader_eval.cl kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl kernels/opencl/kernel_subsurface_scatter.cl @@ -248,6 +250,8 @@ set(SRC_SPLIT_HEADERS split/kernel_path_init.h split/kernel_queue_enqueue.h split/kernel_scene_intersect.h + split/kernel_shader_setup.h + split/kernel_shader_sort.h split/kernel_shader_eval.h split/kernel_shadow_blocked_ao.h split/kernel_shadow_blocked_dl.h @@ -457,6 +461,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_interse delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_sort.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl) diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 6417f621c8f..9b354457b91 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -64,6 +64,18 @@ CCL_NAMESPACE_BEGIN # define WORK_POOL_SIZE WORK_POOL_SIZE_CPU #endif + +#define SHADER_SORT_BLOCK_SIZE 2048 + +#ifdef __KERNEL_OPENCL__ +# define SHADER_SORT_LOCAL_SIZE 64 +#elif defined(__KERNEL_CUDA__) +# define SHADER_SORT_LOCAL_SIZE 32 +#else +# define SHADER_SORT_LOCAL_SIZE 1 +#endif + + /* device capabilities */ #ifdef __KERNEL_CPU__ # ifdef __KERNEL_SSE2__ @@ -1321,6 +1333,9 @@ enum QueueNumber { */ QUEUE_SHADOW_RAY_CAST_DL_RAYS, + /* Rays sorted according to shader->id */ + QUEUE_SHADER_SORTED_RAYS, + #ifdef __BRANCHED_PATH__ /* All rays moving to next iteration of the indirect loop for light */ QUEUE_LIGHT_INDIRECT_ITER, diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 896b80d783e..39c9a9cf33c 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -77,6 +77,8 @@ DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) DECLARE_SPLIT_KERNEL_FUNCTION(do_volume) DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort) DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 96f54bb427e..8c05dd1d9ef 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -44,6 +44,8 @@ # include "kernel/split/kernel_do_volume.h" # include "kernel/split/kernel_queue_enqueue.h" # include "kernel/split/kernel_indirect_background.h" +# include "kernel/split/kernel_shader_setup.h" +# include "kernel/split/kernel_shader_sort.h" # include "kernel/split/kernel_shader_eval.h" # include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" # include "kernel/split/kernel_subsurface_scatter.h" @@ -181,7 +183,9 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) @@ -209,6 +213,8 @@ void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, REGISTER(do_volume); REGISTER(queue_enqueue); REGISTER(indirect_background); + REGISTER(shader_setup); + REGISTER(shader_sort); REGISTER(shader_eval); REGISTER(holdout_emission_blurring_pathtermination_ao); REGISTER(subsurface_scatter); diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 585b91876a9..8b7f1a8d405 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -31,6 +31,8 @@ #include "kernel/split/kernel_do_volume.h" #include "kernel/split/kernel_queue_enqueue.h" #include "kernel/split/kernel_indirect_background.h" +#include "kernel/split/kernel_shader_setup.h" +#include "kernel/split/kernel_shader_sort.h" #include "kernel/split/kernel_shader_eval.h" #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" #include "kernel/split/kernel_subsurface_scatter.h" @@ -108,7 +110,9 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) +DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index 6baee460986..5bfb31b193a 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -22,6 +22,5 @@ __kernel void kernel_ocl_path_trace_shader_eval( ccl_global char *kg, ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics; - kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics); + kernel_shader_eval((KernelGlobals*)kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl similarity index 79% copy from intern/cycles/kernel/kern @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list [email protected] https://lists.blender.org/mailman/listinfo/bf-blender-cvs
