Commit: 6bf4115c13962c99d1cdc97f2be92c4922f3fd33
Author: Hristo Gueorguiev
Date:   Wed May 3 15:30:45 2017 +0200
Branches: master
https://developer.blender.org/rB6bf4115c13962c99d1cdc97f2be92c4922f3fd33

Cycles: Split kernel - sort shaders

Reduce thread divergence in kernel_shader_eval.

Rays are sorted in blocks of 2048 according to shader->id.

On R9 290 Classroom is ~30% faster, and Pabellon Barcelone is ~8% faster.

No sorting for CUDA split kernel.

Reviewers: sergey, maiself

Reviewed By: maiself

Differential Revision: https://developer.blender.org/D2598

===================================================================

M       intern/cycles/device/device_split_kernel.cpp
M       intern/cycles/device/device_split_kernel.h
M       intern/cycles/kernel/CMakeLists.txt
M       intern/cycles/kernel/kernel_types.h
M       intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M       intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M       intern/cycles/kernel/kernels/cuda/kernel_split.cu
M       intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
A       intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
A       intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
M       intern/cycles/kernel/kernels/opencl/kernel_split.cl
M       intern/cycles/kernel/split/kernel_shader_eval.h
A       intern/cycles/kernel/split/kernel_shader_setup.h
A       intern/cycles/kernel/split/kernel_shader_sort.h
M       intern/cycles/kernel/split/kernel_split_data_types.h

===================================================================

diff --git a/intern/cycles/device/device_split_kernel.cpp 
b/intern/cycles/device/device_split_kernel.cpp
index bb3089c5418..9118793aad6 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -39,6 +39,8 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) : 
device(device)
        kernel_do_volume = NULL;
        kernel_queue_enqueue = NULL;
        kernel_indirect_background = NULL;
+       kernel_shader_setup = NULL;
+       kernel_shader_sort = NULL;
        kernel_shader_eval = NULL;
        kernel_holdout_emission_blurring_pathtermination_ao = NULL;
        kernel_subsurface_scatter = NULL;
@@ -64,6 +66,8 @@ DeviceSplitKernel::~DeviceSplitKernel()
        delete kernel_do_volume;
        delete kernel_queue_enqueue;
        delete kernel_indirect_background;
+       delete kernel_shader_setup;
+       delete kernel_shader_sort;
        delete kernel_shader_eval;
        delete kernel_holdout_emission_blurring_pathtermination_ao;
        delete kernel_subsurface_scatter;
@@ -89,6 +93,8 @@ bool DeviceSplitKernel::load_kernels(const 
DeviceRequestedFeatures& requested_fe
        LOAD_KERNEL(do_volume);
        LOAD_KERNEL(queue_enqueue);
        LOAD_KERNEL(indirect_background);
+       LOAD_KERNEL(shader_setup);
+       LOAD_KERNEL(shader_sort);
        LOAD_KERNEL(shader_eval);
        LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
        LOAD_KERNEL(subsurface_scatter);
@@ -241,6 +247,8 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
                                ENQUEUE_SPLIT_KERNEL(do_volume, global_size, 
local_size);
                                ENQUEUE_SPLIT_KERNEL(queue_enqueue, 
global_size, local_size);
                                ENQUEUE_SPLIT_KERNEL(indirect_background, 
global_size, local_size);
+                               ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, 
local_size);
+                               ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, 
local_size);
                                ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, 
local_size);
                                
ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, 
local_size);
                                ENQUEUE_SPLIT_KERNEL(subsurface_scatter, 
global_size, local_size);
diff --git a/intern/cycles/device/device_split_kernel.h 
b/intern/cycles/device/device_split_kernel.h
index 55548122c0c..58c2fdbb077 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -61,6 +61,8 @@ private:
        SplitKernelFunction *kernel_do_volume;
        SplitKernelFunction *kernel_queue_enqueue;
        SplitKernelFunction *kernel_indirect_background;
+       SplitKernelFunction *kernel_shader_setup;
+       SplitKernelFunction *kernel_shader_sort;
        SplitKernelFunction *kernel_shader_eval;
        SplitKernelFunction 
*kernel_holdout_emission_blurring_pathtermination_ao;
        SplitKernelFunction *kernel_subsurface_scatter;
diff --git a/intern/cycles/kernel/CMakeLists.txt 
b/intern/cycles/kernel/CMakeLists.txt
index a92e8bc4aee..9bb0455b9d5 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -21,6 +21,8 @@ set(SRC
        kernels/opencl/kernel_lamp_emission.cl
        kernels/opencl/kernel_do_volume.cl
        kernels/opencl/kernel_indirect_background.cl
+       kernels/opencl/kernel_shader_setup.cl
+       kernels/opencl/kernel_shader_sort.cl
        kernels/opencl/kernel_shader_eval.cl
        kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
        kernels/opencl/kernel_subsurface_scatter.cl
@@ -248,6 +250,8 @@ set(SRC_SPLIT_HEADERS
        split/kernel_path_init.h
        split/kernel_queue_enqueue.h
        split/kernel_scene_intersect.h
+       split/kernel_shader_setup.h
+       split/kernel_shader_sort.h
        split/kernel_shader_eval.h
        split/kernel_shadow_blocked_ao.h
        split/kernel_shadow_blocked_dl.h
@@ -457,6 +461,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_scene_interse
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_lamp_emission.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_do_volume.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_indirect_background.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_shader_setup.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_shader_sort.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_shader_eval.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_subsurface_scatter.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
diff --git a/intern/cycles/kernel/kernel_types.h 
b/intern/cycles/kernel/kernel_types.h
index 6417f621c8f..9b354457b91 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -64,6 +64,18 @@ CCL_NAMESPACE_BEGIN
 #  define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
 #endif
 
+
+#define SHADER_SORT_BLOCK_SIZE 2048
+
+#ifdef __KERNEL_OPENCL__
+#  define SHADER_SORT_LOCAL_SIZE 64
+#elif defined(__KERNEL_CUDA__)
+#  define SHADER_SORT_LOCAL_SIZE 32
+#else
+#  define SHADER_SORT_LOCAL_SIZE 1
+#endif
+
+
 /* device capabilities */
 #ifdef __KERNEL_CPU__
 #  ifdef __KERNEL_SSE2__
@@ -1321,6 +1333,9 @@ enum QueueNumber {
         */
        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
 
+       /* Rays sorted according to shader->id */
+       QUEUE_SHADER_SORTED_RAYS,
+
 #ifdef __BRANCHED_PATH__
        /* All rays moving to next iteration of the indirect loop for light */
        QUEUE_LIGHT_INDIRECT_ITER,
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h 
b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 896b80d783e..39c9a9cf33c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -77,6 +77,8 @@ DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
 DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
 DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
 DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort)
 DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
 DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
 DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h 
b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 96f54bb427e..8c05dd1d9ef 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -44,6 +44,8 @@
 #  include "kernel/split/kernel_do_volume.h"
 #  include "kernel/split/kernel_queue_enqueue.h"
 #  include "kernel/split/kernel_indirect_background.h"
+#  include "kernel/split/kernel_shader_setup.h"
+#  include "kernel/split/kernel_shader_sort.h"
 #  include "kernel/split/kernel_shader_eval.h"
 #  include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
 #  include "kernel/split/kernel_subsurface_scatter.h"
@@ -181,7 +183,9 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
 DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
 
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao,
 BackgroundAOLocals)
 DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
@@ -209,6 +213,8 @@ void 
KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name,
        REGISTER(do_volume);
        REGISTER(queue_enqueue);
        REGISTER(indirect_background);
+       REGISTER(shader_setup);
+       REGISTER(shader_sort);
        REGISTER(shader_eval);
        REGISTER(holdout_emission_blurring_pathtermination_ao);
        REGISTER(subsurface_scatter);
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu 
b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 585b91876a9..8b7f1a8d405 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -31,6 +31,8 @@
 #include "kernel/split/kernel_do_volume.h"
 #include "kernel/split/kernel_queue_enqueue.h"
 #include "kernel/split/kernel_indirect_background.h"
+#include "kernel/split/kernel_shader_setup.h"
+#include "kernel/split/kernel_shader_sort.h"
 #include "kernel/split/kernel_shader_eval.h"
 #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
 #include "kernel/split/kernel_subsurface_scatter.h"
@@ -108,7 +110,9 @@ DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
 DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
 DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
 
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao,
 BackgroundAOLocals)
 DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
 DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index 6baee460986..5bfb31b193a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -22,6 +22,5 @@ __kernel void kernel_ocl_path_trace_shader_eval(
         ccl_global char *kg,
         ccl_constant KernelData *data)
 {
-       ccl_local unsigned int local_queue_atomics;
-       kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics);
+       kernel_shader_eval((KernelGlobals*)kg);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
similarity index 79%
copy from intern/cycles/kernel/kern

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to