Commit: 1bd5b5a83556243121f823cb0e109c5028b30c06
Author: Mai Lavelle
Date: Sat Dec 17 01:17:46 2016 -0500
Branches: cycles_split_kernel
https://developer.blender.org/rB1bd5b5a83556243121f823cb0e109c5028b30c06
Cycles: Implement split kernel for CUDA
It is helpful to have another device to test against, and to check that the
split kernel remains compatible with all devices.
===================================================================
M intern/cycles/blender/addon/properties.py
M intern/cycles/blender/addon/ui.py
M intern/cycles/blender/blender_python.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/kernel/CMakeLists.txt
M intern/cycles/kernel/kernel_compat_cuda.h
M intern/cycles/kernel/kernel_compat_opencl.h
M intern/cycles/kernel/kernel_globals.h
M intern/cycles/kernel/kernel_queues.h
M intern/cycles/kernel/kernel_shadow.h
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/kernels/cuda/kernel.cu
A intern/cycles/kernel/kernels/cuda/kernel_config.h
A intern/cycles/kernel/kernels/cuda/kernel_split.cu
M intern/cycles/kernel/split/kernel_background_buffer_update.h
M intern/cycles/kernel/split/kernel_data_init.h
M intern/cycles/kernel/split/kernel_direct_lighting.h
M intern/cycles/kernel/split/kernel_lamp_emission.h
M intern/cycles/kernel/split/kernel_shadow_blocked.h
M intern/cycles/kernel/split/kernel_split_data.h
M intern/cycles/util/util_atomic.h
M intern/cycles/util/util_debug.cpp
M intern/cycles/util/util_debug.h
M intern/cycles/util/util_types.h
===================================================================
diff --git a/intern/cycles/blender/addon/properties.py
b/intern/cycles/blender/addon/properties.py
index 315550b..310b82e 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -648,6 +648,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
+ cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)
cls.debug_opencl_kernel_type = EnumProperty(
name="OpenCL Kernel Type",
diff --git a/intern/cycles/blender/addon/ui.py
b/intern/cycles/blender/addon/ui.py
index 22cf890..e1fa5be 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1532,6 +1532,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
col = layout.column()
col.label('CUDA Flags:')
col.prop(cscene, "debug_use_cuda_adaptive_compile")
+ col.prop(cscene, "debug_use_cuda_split_kernel")
col = layout.column()
col.label('OpenCL Flags:')
diff --git a/intern/cycles/blender/blender_python.cpp
b/intern/cycles/blender/blender_python.cpp
index ed410e1..75118c4 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -70,6 +70,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
+ flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
/* Synchronize OpenCL kernel type. */
switch(get_enum(cscene, "debug_opencl_kernel_type")) {
case 0:
diff --git a/intern/cycles/device/device_cuda.cpp
b/intern/cycles/device/device_cuda.cpp
index 1316a3e..1b8b09b 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -21,6 +21,7 @@
#include "device.h"
#include "device_intern.h"
+#include "device_split_kernel.h"
#include "buffers.h"
@@ -42,6 +43,8 @@
#include "util_types.h"
#include "util_time.h"
+#include "split/kernel_split_data.h"
+
CCL_NAMESPACE_BEGIN
#ifndef WITH_CUDA_DYNLOAD
@@ -258,11 +261,16 @@ public:
return DebugFlags().cuda.adaptive_compile;
}
+ bool use_split_kernel()
+ {
+ return DebugFlags().cuda.split_kernel;
+ }
+
/* Common NVCC flags which stays the same regardless of shading model,
* kernel sources md5 and only depends on compiler or compilation
settings.
*/
string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures& requested_features)
+ const DeviceRequestedFeatures& requested_features, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
@@ -287,6 +295,11 @@ public:
#ifdef WITH_CYCLES_DEBUG
cflags += " -D__KERNEL_DEBUG__";
#endif
+
+ if(split) {
+ cflags += " -D__SPLIT__";
+ }
+
return cflags;
}
@@ -320,7 +333,7 @@ public:
return true;
}
- string compile_kernel(const DeviceRequestedFeatures& requested_features)
+ string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
{
/* Compute cubin name. */
int major, minor;
@@ -329,7 +342,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
- const string cubin =
path_get(string_printf("lib/kernel_sm_%d%d.cubin",
+ const string cubin = path_get(string_printf(split ?
"lib/kernel_split_sm_%d%d.cubin"
+ :
"lib/kernel_sm_%d%d.cubin",
major,
minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin
<< ".";
if(path_exists(cubin)) {
@@ -339,7 +353,7 @@ public:
}
const string common_cflags =
- compile_kernel_get_common_cflags(requested_features);
+ compile_kernel_get_common_cflags(requested_features, split);
/* Try to use locally compiled kernel. */
const string kernel_path = path_get("kernel");
@@ -350,7 +364,8 @@ public:
*/
const string cubin_md5 = util_md5_string(kernel_md5 +
common_cflags);
- const string cubin_file =
string_printf("cycles_kernel_sm%d%d_%s.cubin",
+ const string cubin_file = string_printf(split ?
"cycles_kernel_split_sm%d%d_%s.cubin"
+ :
"cycles_kernel_sm%d%d_%s.cubin",
major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels",
cubin_file));
@@ -385,7 +400,7 @@ public:
const char *nvcc = cuewCompilerPath();
const string kernel = path_join(kernel_path,
path_join("kernels",
- path_join("cuda", "kernel.cu")));
+ path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@@ -433,7 +448,7 @@ public:
return false;
/* get kernel */
- string cubin = compile_kernel(requested_features);
+ string cubin = compile_kernel(requested_features, use_split_kernel());
if(cubin == "")
return false;
@@ -1264,25 +1279,48 @@ public:
/* Upload Bindless Mapping */
load_bindless_mapping();
- /* keep rendering tiles until done */
- while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample +
tile.num_samples;
+ if(!use_split_kernel()) {
+ /* keep rendering tiles until done */
+ while(task->acquire_tile(this, tile)) {
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample +
tile.num_samples;
- for(int sample = start_sample; sample <
end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue ==
false)
- break;
- }
+ for(int sample = start_sample; sample <
end_sample; sample++) {
+ if(task->get_cancel()) {
+
if(task->need_finish_queue == false)
+ break;
+ }
- path_trace(tile, sample, branched);
+ path_trace(tile, sample,
branched);
- tile.sample = sample + 1;
+ tile.sample = sample + 1;
- task->update_progress(&tile,
tile.w*tile.h);
+ task->update_progress(&tile,
tile.w*tile.h);
+ }
+
+ task->release_tile(tile);
+ }
+ }
+ else {
+ DeviceRequestedFeatures requested_features;
+ if(!use_adaptive_compilation()) {
+ requested_features.max_closure = 64;
}
- task->release_tile(tile);
+ DeviceSplitKernel split_kernel(this);
+ split_kernel.load_kernels(requested_features);
+
+ while(task->acquire_tile(this, tile)) {
+ device_memory data;
+ split_kernel.path_trace(task, tile,
data);
+
+ task->release_tile(tile);
+
+ if(task->get_cancel()) {
+ if(task->need_finish_queue ==
false)
+ break;
+ }
+ }
}
}
else if(task->type == DeviceTask::SHADER) {
@@ -1335,6 +1373,177 @@ public:
{
task_pool.cancel();
}
+
+ /* split kernel */
+ class CUDASplitKernelFunction : public SplitKernelFunction{
+ CUDADevice* device;
+ CUfunction func;
+ public:
+ CUDASplitKernelFunction(CUDADevice *device, CUfunction func) :
device(device), func(func) {}
+
+ /* enqueue the kernel, returns false if there is an error */
+ bool enqueue(const KernelDimensions &dim, device_memory
&/*kg*/, device_memory &/*data*/)
+ {
+ return device->enqueue_split_kernel_function(dim, func,
NULL);
+ }
+ };
+
+ bool enqueue_split_kernel_function(const KernelDimensions &dim,
CUfunction func, void *args[]) {
+ cuda_push_context();
+
+ if(have_error())
+ return false;
+
+ /* we ignore dim.local_size for now, as this is faster */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(&threads_per_block,
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+ int xthreads = (int)sqrt(threads_per_block);
+ int ythreads = (int)sqrt(threads_per_block);
+
+ int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
+ int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
+
+ cuda_assert(cuFuncSetCacheConfig(func,
CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(func,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, args, 0));
+
+ cuda_pop_context();
+
+ return !have_error();
+ }
+
+ bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ int num_parallel_samples,
+ device_memory& /*kernel_globals*/,
+ device_memory& /*kernel_data*/,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs)
+ {
+ cuda_push_context();
+
+ CUdeviceptr d_split_data =
cuda_device_ptr(split_data.device_pointer);
+ CUdeviceptr d_ray_state =
cuda_device_ptr(ray_state.device_pointer);
+ CUdeviceptr d_queue_index =
cuda_device_ptr(queue_index.device_pointer);
+ CUdeviceptr d_use_queues_flag =
cuda_device_ptr(use_queues_flag.device_pointer);
+ CUdeviceptr d_work_pool_wgs =
cuda_device_ptr(work_pool_wgs.device_pointer);
+
+ CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
+ CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
+
+ int end_sample = rtile.start_sample + rtile.num_samples;
+ int queue_size = dim.global_size[0] * dim.global_size[1];
+
+ struct args_t {
+ CUdeviceptr* split_data_buffer;
+ int* num_elements;
+ CUdeviceptr* ray_state;
+ CUdeviceptr* rng_state;
+ int* start_sample;
+ int*
@@ Diff output truncated at 10240 characters. @@
_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs