[Bf-blender-cvs] [92ce4a8f35] temp_cycles_split_kernel: Cycles: CUDA implementation of split kernel

Mai Lavelle Wed, 22 Feb 2017 06:59:46 -0800

Commit: 92ce4a8f35f991e2c0eb99cebcdcb34ddd942ee4
Author: Mai Lavelle
Date:   Tue Feb 14 05:50:29 2017 -0500
Branches: temp_cycles_split_kernel
https://developer.blender.org/rB92ce4a8f35f991e2c0eb99cebcdcb34ddd942ee4


Cycles: CUDA implementation of split kernel

===================================================================

M       intern/cycles/blender/addon/properties.py
M       intern/cycles/blender/addon/ui.py
M       intern/cycles/blender/blender_python.cpp
M       intern/cycles/device/device_cuda.cpp
M       intern/cycles/kernel/CMakeLists.txt
M       intern/cycles/kernel/kernel_compat_cuda.h
M       intern/cycles/kernel/kernel_types.h
M       intern/cycles/kernel/kernels/cuda/kernel.cu
A       intern/cycles/kernel/kernels/cuda/kernel_config.h
A       intern/cycles/kernel/kernels/cuda/kernel_split.cu
M       intern/cycles/kernel/split/kernel_split_data.h
M       intern/cycles/util/util_debug.cpp
M       intern/cycles/util/util_debug.h

===================================================================

diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 1f0b712c93..ca10973431 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -668,6 +668,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", 
default=False)
 
         cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive 
Compile", default=False)
+        cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", 
default=False)
 
         cls.debug_opencl_kernel_type = EnumProperty(
             name="OpenCL Kernel Type",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 8d3fe87759..7c1e3e270f 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1523,6 +1523,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
         col = layout.column()
         col.label('CUDA Flags:')
         col.prop(cscene, "debug_use_cuda_adaptive_compile")
+        col.prop(cscene, "debug_use_cuda_split_kernel")
 
         col = layout.column()
         col.label('OpenCL Flags:')
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index ed410e15e7..75118c4374 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -70,6 +70,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
        flags.cpu.split_kernel = get_boolean(cscene, 
"debug_use_cpu_split_kernel");
        /* Synchronize CUDA flags. */
        flags.cuda.adaptive_compile = get_boolean(cscene, 
"debug_use_cuda_adaptive_compile");
+       flags.cuda.split_kernel = get_boolean(cscene, 
"debug_use_cuda_split_kernel");
        /* Synchronize OpenCL kernel type. */
        switch(get_enum(cscene, "debug_opencl_kernel_type")) {
                case 0:
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 4a159eee9f..2409528df4 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -21,6 +21,7 @@
 
 #include "device.h"
 #include "device_intern.h"
+#include "device_split_kernel.h"
 
 #include "buffers.h"
 
@@ -42,6 +43,8 @@
 #include "util_types.h"
 #include "util_time.h"
 
+#include "split/kernel_split_data.h"
+
 CCL_NAMESPACE_BEGIN
 
 #ifndef WITH_CUDA_DYNLOAD
@@ -78,6 +81,29 @@ int cuewCompilerVersion(void)
 }  /* namespace */
 #endif  /* WITH_CUDA_DYNLOAD */
 
+class CUDADevice;
+
+class CUDASplitKernel : public DeviceSplitKernel {
+       CUDADevice *device;
+public:
+       explicit CUDASplitKernel(CUDADevice *device);
+
+       virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                                   RenderTile& rtile,
+                                                   int num_global_elements,
+                                                   device_memory& 
kernel_globals,
+                                                   device_memory& kernel_data_,
+                                                   device_memory& split_data,
+                                                   device_memory& ray_state,
+                                                   device_memory& queue_index,
+                                                   device_memory& 
use_queues_flag,
+                                                   device_memory& 
work_pool_wgs);
+
+       virtual SplitKernelFunction* get_split_kernel_function(string 
kernel_name, const DeviceRequestedFeatures&);
+       virtual int2 split_kernel_local_size();
+       virtual int2 split_kernel_global_size(DeviceTask *task);
+};
+
 class CUDADevice : public Device
 {
 public:
@@ -258,11 +284,16 @@ public:
                return DebugFlags().cuda.adaptive_compile;
        }
 
+       bool use_split_kernel()
+       {
+               return DebugFlags().cuda.split_kernel;
+       }
+
        /* Common NVCC flags which stays the same regardless of shading model,
         * kernel sources md5 and only depends on compiler or compilation 
settings.
         */
        string compile_kernel_get_common_cflags(
-               const DeviceRequestedFeatures& requested_features)
+               const DeviceRequestedFeatures& requested_features, bool 
split=false)
        {
                const int cuda_version = cuewCompilerVersion();
                const int machine = system_cpu_bits();
@@ -287,6 +318,11 @@ public:
 #ifdef WITH_CYCLES_DEBUG
                cflags += " -D__KERNEL_DEBUG__";
 #endif
+
+               if(split) {
+                       cflags += " -D__SPLIT__";
+               }
+
                return cflags;
        }
 
@@ -320,7 +356,7 @@ public:
                return true;
        }
 
-       string compile_kernel(const DeviceRequestedFeatures& requested_features)
+       string compile_kernel(const DeviceRequestedFeatures& 
requested_features, bool split=false)
        {
                /* Compute cubin name. */
                int major, minor;
@@ -329,7 +365,8 @@ public:
 
                /* Attempt to use kernel provided with Blender. */
                if(!use_adaptive_compilation()) {
-                       const string cubin = 
path_get(string_printf("lib/kernel_sm_%d%d.cubin",
+                       const string cubin = path_get(string_printf(split ? 
"lib/kernel_split_sm_%d%d.cubin"
+                                                                         : 
"lib/kernel_sm_%d%d.cubin",
                                                                    major, 
minor));
                        VLOG(1) << "Testing for pre-compiled kernel " << cubin 
<< ".";
                        if(path_exists(cubin)) {
@@ -339,7 +376,7 @@ public:
                }
 
                const string common_cflags =
-                       compile_kernel_get_common_cflags(requested_features);
+                       compile_kernel_get_common_cflags(requested_features, 
split);
 
                /* Try to use locally compiled kernel. */
                const string kernel_path = path_get("kernel");
@@ -350,7 +387,8 @@ public:
                 */
                const string cubin_md5 = util_md5_string(kernel_md5 + 
common_cflags);
 
-               const string cubin_file = 
string_printf("cycles_kernel_sm%d%d_%s.cubin",
+               const string cubin_file = string_printf(split ? 
"cycles_kernel_split_sm%d%d_%s.cubin"
+                                                             : 
"cycles_kernel_sm%d%d_%s.cubin",
                                                        major, minor,
                                                        cubin_md5.c_str());
                const string cubin = path_cache_get(path_join("kernels", 
cubin_file));
@@ -385,7 +423,7 @@ public:
                const char *nvcc = cuewCompilerPath();
                const string kernel = path_join(kernel_path,
                                          path_join("kernels",
-                                                   path_join("cuda", 
"kernel.cu")));
+                                                   path_join("cuda", split ? 
"kernel_split.cu" : "kernel.cu")));
                double starttime = time_dt();
                printf("Compiling CUDA kernel ...\n");
 
@@ -433,7 +471,7 @@ public:
                        return false;
 
                /* get kernel */
-               string cubin = compile_kernel(requested_features);
+               string cubin = compile_kernel(requested_features, 
use_split_kernel());
 
                if(cubin == "")
                        return false;
@@ -1260,25 +1298,48 @@ public:
                        /* Upload Bindless Mapping */
                        load_bindless_mapping();
 
-                       /* keep rendering tiles until done */
-                       while(task->acquire_tile(this, tile)) {
-                               int start_sample = tile.start_sample;
-                               int end_sample = tile.start_sample + 
tile.num_samples;
+                       if(!use_split_kernel()) {
+                               /* keep rendering tiles until done */
+                               while(task->acquire_tile(this, tile)) {
+                                       int start_sample = tile.start_sample;
+                                       int end_sample = tile.start_sample + 
tile.num_samples;
 
-                               for(int sample = start_sample; sample < 
end_sample; sample++) {
-                                       if(task->get_cancel()) {
-                                               if(task->need_finish_queue == 
false)
-                                                       break;
-                                       }
+                                       for(int sample = start_sample; sample < 
end_sample; sample++) {
+                                               if(task->get_cancel()) {
+                                                       if(task->need_finish_queue == false)
+                                                               break;
+                                               }
 
-                                       path_trace(tile, sample, branched);
+                                               path_trace(tile, sample, 
branched);
 
-                                       tile.sample = sample + 1;
+                                               tile.sample = sample + 1;
 
-                                       task->update_progress(&tile, 
tile.w*tile.h);
+                                               task->update_progress(&tile, 
tile.w*tile.h);
+                                       }
+
+                                       task->release_tile(tile);
                                }
+                       }
+                       else {
+                               DeviceRequestedFeatures requested_features;
+                               if(!use_adaptive_compilation()) {
+                                       requested_features.max_closure = 64;
+                               }
+
+                               CUDASplitKernel split_kernel(this);
+                               split_kernel.load_kernels(requested_features);
+
+                               while(task->acquire_tile(this, tile)) {
+                                       device_memory void_buffer;
+                                       split_kernel.path_trace(task, tile, 
void_buffer, void_buffer);
+
+                                       task->release_tile(tile);
 
-                               task->release_tile(tile);
+                                       if(task->get_cancel()) {
+                                               if(task->need_finish_queue == 
false)
+                                                       break;
+                                       }
+                               }
                        }
                }
                else if(task->type == DeviceTask::SHADER) {
@@ -1331,8 +1392,186 @@ public:
        {
                task_pool.cancel();
        }
+
+       friend class CUDASplitKernelFunction;
+       friend class CUDASplitKernel;
 };
 
+/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
+ * now that the definition of that class is complete
+ */
+#undef cuda_assert
+#define cuda_assert(stmt) \
+       { \
+               CUresult result = stmt; \
+               \
+               if(result != CUDA_SUCCESS) { \
+                       string message = string_printf("CUDA error: %s in %s", 
cuewErrorString(result), #stmt); \
+                       if(device->error_msg == "") \
+                               device->error_msg = message; \
+                       fprintf(stderr, "%s\n", message.c_str()); \
+                       /*cuda_abort();*/ \
+                       device->cuda_error_documentation(); \
+               } \
+       } (void)0
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction{
+       CUDADevice* device;
+       CUfunction func;
+public:
+       CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : 
device(device), func(func) {}
+
+       /* enqueue the kernel, returns false if there is an error */
+       bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, 
device_memory &/*data*/)
+       {
+               return enqueue(dim, NULL);
+       }
+
+       /* enqueue the kernel, returns false if there is an error */
+       bool enqueue(const KernelDimensions &dim, void *args[])
+       {
+               device->cuda_push_context();
+
+               if(device->have_error())
+                       return false;
+
+               /* we ignore dim.local_size for now, as this is faster */
+               int threads_per_block;
+               cuda_assert(cuFuncGetAttribute(&threads_per_block, 
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+               int xthreads = (int)sqrt(threads_per_block);
+               int ythreads = (int)sqrt(threads_per_block);
+
+               int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
+               int yblocks = (d

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
Bf-blender-cvs@blender.org
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

[Bf-blender-cvs] [92ce4a8f35] temp_cycles_split_kernel: Cycles: CUDA implementation of split kernel

Reply via email to