Commit: eb293f59f2eb9847b8fd593ac2dde2781ac8ace1
Author: Mai Lavelle
Date:   Thu May 11 19:23:49 2017 -0400
Branches: master
https://developer.blender.org/rBeb293f59f2eb9847b8fd593ac2dde2781ac8ace1

Cycles: Pass all buffers to each kernel call for OpenCL

Technically not passing all buffers used by a kernel is undefined
behavior. We haven't had any issues with this so far on AMD or
Nvidia, but it's known to be a problem with Intel and we received
a report from AMD that this is a problem on newer hardware, so we
need to make this change at some point.

Unfortunately there is a cost to being correct, about 5% for the
benchmark scenes. For low sample counts it's even worse, I've
seen up to a 50% slowdown. For the latter case I think adjusting
the tile updating logic can help, but I'm not sure what that would
look like yet (it would be just a few lines of change, however).

===================================================================

M       intern/cycles/device/opencl/opencl_split.cpp
M       intern/cycles/kernel/CMakeLists.txt
M       intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
M       intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
M       intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
M       intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
M       intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
M       intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
M       intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
M       intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
M       intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
M       intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
M       intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
M       intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
M       intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
M       intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
M       intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
M       intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
M       intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
A       intern/cycles/kernel/kernels/opencl/kernel_split_function.h
M       intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl

===================================================================

diff --git a/intern/cycles/device/opencl/opencl_split.cpp 
b/intern/cycles/device/opencl/opencl_split.cpp
index 76dcbd6fc9a..08b632ee9d3 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -176,17 +176,62 @@ protected:
        friend class OpenCLSplitKernelFunction;
 };
 
+struct CachedSplitMemory {
+       int id;
+       device_memory *split_data;
+       device_memory *ray_state;
+       device_ptr *rng_state;
+       device_memory *queue_index;
+       device_memory *use_queues_flag;
+       device_memory *work_pools;
+       device_ptr *buffer;
+};
+
 class OpenCLSplitKernelFunction : public SplitKernelFunction {
 public:
        OpenCLDeviceSplitKernel* device;
        OpenCLDeviceBase::OpenCLProgram program;
+       CachedSplitMemory& cached_memory;
+       int cached_id;
+
+       OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, 
CachedSplitMemory& cached_memory) :
+                       device(device), cached_memory(cached_memory), 
cached_id(cached_memory.id-1)
+       {
+       }
 
-       OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : 
device(device) {}
-       ~OpenCLSplitKernelFunction() { program.release(); }
+       ~OpenCLSplitKernelFunction()
+       {
+               program.release();
+       }
 
        virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, 
device_memory& data)
        {
-               device->kernel_set_args(program(), 0, kg, data);
+               if(cached_id != cached_memory.id) {
+                       cl_uint start_arg_index =
+                               device->kernel_set_args(program(),
+                                                   0,
+                                                   kg,
+                                                   data,
+                                                   *cached_memory.split_data,
+                                                   *cached_memory.ray_state,
+                                                   *cached_memory.rng_state);
+
+/* TODO(sergey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+                               device->set_kernel_arg_mem(program(), 
&start_arg_index, #name);
+#include "kernel/kernel_textures.h"
+#undef KERNEL_TEX
+
+                       start_arg_index +=
+                               device->kernel_set_args(program(),
+                                                   start_arg_index,
+                                                   *cached_memory.queue_index,
+                                                   
*cached_memory.use_queues_flag,
+                                                   *cached_memory.work_pools,
+                                                   *cached_memory.buffer);
+
+                       cached_id = cached_memory.id;
+               }
 
                device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
                                                       program(),
@@ -213,6 +258,7 @@ public:
 
 class OpenCLSplitKernel : public DeviceSplitKernel {
        OpenCLDeviceSplitKernel *device;
+       CachedSplitMemory cached_memory;
 public:
        explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : 
DeviceSplitKernel(device), device(device) {
        }
@@ -220,7 +266,7 @@ public:
        virtual SplitKernelFunction* get_split_kernel_function(string 
kernel_name,
                                                               const 
DeviceRequestedFeatures& requested_features)
        {
-               OpenCLSplitKernelFunction* kernel = new 
OpenCLSplitKernelFunction(device);
+               OpenCLSplitKernelFunction* kernel = new 
OpenCLSplitKernelFunction(device, cached_memory);
 
                bool single_program = OpenCLInfo::use_single_program();
                kernel->program =
@@ -349,6 +395,15 @@ public:
                        return false;
                }
 
+               cached_memory.split_data = &split_data;
+               cached_memory.ray_state = &ray_state;
+               cached_memory.rng_state = &rtile.rng_state;
+               cached_memory.queue_index = &queue_index;
+               cached_memory.use_queues_flag = &use_queues_flag;
+               cached_memory.work_pools = &work_pool_wgs;
+               cached_memory.buffer = &rtile.buffer;
+               cached_memory.id++;
+
                return true;
        }
 
diff --git a/intern/cycles/kernel/CMakeLists.txt 
b/intern/cycles/kernel/CMakeLists.txt
index b85067d4e66..23e9bd311c4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -122,6 +122,10 @@ set(SRC_KERNELS_CUDA_HEADERS
        kernels/cuda/kernel_config.h
 )
 
+set(SRC_KERNELS_OPENCL_HEADERS
+       kernels/opencl/kernel_split_function.h
+)
+
 set(SRC_CLOSURE_HEADERS
        closure/alloc.h
        closure/bsdf.h
@@ -452,6 +456,7 @@ add_library(cycles_kernel
        ${SRC_HEADERS}
        ${SRC_KERNELS_CPU_HEADERS}
        ${SRC_KERNELS_CUDA_HEADERS}
+       ${SRC_KERNELS_OPENCL_HEADERS}
        ${SRC_BVH_HEADERS}
        ${SRC_CLOSURE_HEADERS}
        ${SRC_FILTER_HEADERS}
@@ -496,6 +501,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_enqueue_inact
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_next_iteration_setup.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_indirect_subsurface.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_buffer_update.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"kernels/opencl/kernel_split_function.h" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" 
${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
index db65c91baf7..dcea2630aef 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
@@ -18,10 +18,9 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_buffer_update.h"
 
-__kernel void kernel_ocl_path_trace_buffer_update(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-       ccl_local unsigned int local_queue_atomics;
-       kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME buffer_update
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index eb34f750881..ed64ae01aae 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -18,10 +18,9 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_direct_lighting.h"
 
-__kernel void kernel_ocl_path_trace_direct_lighting(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-       ccl_local unsigned int local_queue_atomics;
-       kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME direct_lighting
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
index 83ef5f5f3f2..8afaa686e28 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_do_volume.h"
 
-__kernel void kernel_ocl_path_trace_do_volume(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-       kernel_do_volume((KernelGlobals*)kg);
-}
+#define KERNEL_NAME do_volume
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
index 940f3b890a4..e68d4104a91 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
@@ -18,10 +18,9 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_enqueue_inactive.h"
 
-__kernel void kernel_ocl_path_trace_enqueue_inactive(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-       ccl_local unsigned int local_queue_atomics;
-       kernel_enqueue_inactive((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME enqueue_inactive
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git 
a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
 
b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index d071b39aa6f..9e1e57beba6 100644
--- 
a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ 
b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -18,12 +18,9 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
 
-__kernel void 
kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-       ccl_local BackgroundAOLocals locals;
-       kernel_holdout_emission_blurring_pathtermination_ao(
-               (KernelGlobals*)kg,
-               &locals);
-}
+#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
+#define LOCALS_TYPE BackgroundAOLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl 
b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
index 8c213ff5cb2..192d01444ba 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
@@ -18,9 +18,7 @@
 #include "kernel/split/kernel_split_common.h"
 #include "kernel/split/kernel_indirect_background.h"
 
-__kernel void kernel_ocl_path_trace_indirect_background(
-        ccl_global char *kg,
-        ccl_constant KernelData *data)
-{
-       kernel_indirect_background((KernelGlobals*)kg);
-}
+#define KERNEL_NAME indirect_background
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indi

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
Bf-blender-cvs@blender.org
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to