Commit: d8c7cec415da34bc9e1687527263dae92414270f
Author: Mai Lavelle
Date: Tue Jan 24 05:51:26 2017 -0500
Branches: cycles_split_kernel
https://developer.blender.org/rBd8c7cec415da34bc9e1687527263dae92414270f
Cycles: Rework work stealing
The previous implementation of work stealing didn't do what it claimed to do
and made some odd assumptions. This new implementation is cleaner,
organizes work slightly more efficiently, and most importantly
decouples work from tile size, which will allow for greater speedups.
===================================================================
M intern/cycles/device/device_split_kernel.cpp
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/kernel_work_stealing.h
M intern/cycles/kernel/split/kernel_background_buffer_update.h
M intern/cycles/kernel/split/kernel_data_init.h
M intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
M intern/cycles/kernel/split/kernel_lamp_emission.h
M intern/cycles/kernel/split/kernel_scene_intersect.h
M intern/cycles/kernel/split/kernel_split_data.h
M intern/cycles/kernel/split/kernel_sum_all_radiance.h
===================================================================
diff --git a/intern/cycles/device/device_split_kernel.cpp
b/intern/cycles/device/device_split_kernel.cpp
index 484416d297..1954c3ccaa 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -115,9 +115,9 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
/* Make sure that set render feasible tile size is a multiple of local
* work size dimensions.
*/
- int2 max_render_feasible_tile_size;
- max_render_feasible_tile_size.x = round_up(task->requested_tile_size.x,
local_size[0]);
- max_render_feasible_tile_size.y = round_up(task->requested_tile_size.y,
local_size[1]);
+ size_t global_size[2];
+ global_size[0] = round_up(task->requested_tile_size.x, local_size[0]);
+ global_size[1] = round_up(task->requested_tile_size.y, local_size[1]);
/* Calculate per_thread_output_buffer_size. */
size_t per_thread_output_buffer_size;
@@ -143,12 +143,8 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
}
}
- /* set global_size */
- size_t global_size[2] = {round_up(tile.w, local_size[0]),
round_up(tile.h, local_size[1])};
- assert(global_size[0] * global_size[1] <=
max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
-
/* Number of elements in the global state buffer */
- int num_global_elements = max_render_feasible_tile_size.x *
max_render_feasible_tile_size.y;
+ int num_global_elements = global_size[0] * global_size[1];
/* Allocate all required global memory once. */
if(first_tile) {
@@ -157,8 +153,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
/* Calculate max groups */
/* Denotes the maximum work groups possible w.r.t. current
requested tile size. */
- unsigned int max_work_groups = (max_render_feasible_tile_size.x
* max_render_feasible_tile_size.y) /
- (local_size[0] * local_size[1]);
+ unsigned int max_work_groups = num_global_elements /
WORK_POOL_SIZE + 1;
/* Allocate work_pool_wgs memory. */
work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
diff --git a/intern/cycles/kernel/kernel_types.h
b/intern/cycles/kernel/kernel_types.h
index e3806f207f..d442de0b56 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -61,6 +61,8 @@ CCL_NAMESPACE_BEGIN
#define VOLUME_STACK_SIZE 16
+#define WORK_POOL_SIZE 64
+
/* device capabilities */
#ifdef __KERNEL_CPU__
# ifdef __KERNEL_SSE2__
diff --git a/intern/cycles/kernel/kernel_work_stealing.h
b/intern/cycles/kernel/kernel_work_stealing.h
index 8065d2dd91..28fc5ce1c3 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -27,160 +27,90 @@ CCL_NAMESPACE_BEGIN
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
-ccl_device uint get_group_id_with_ray_index(uint ray_index,
- uint tile_dim_x,
- uint tile_dim_y,
- int dim)
+ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
{
- if(dim == 0) {
- uint x_span = ray_index % tile_dim_x;
- return x_span / ccl_local_size(0);
+ return kernel_split_params.w * kernel_split_params.h *
kernel_split_params.num_samples;
+}
+
+ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
+{
+ return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
+}
+
+ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint
ray_index)
+{
+ return ray_index / WORK_POOL_SIZE;
+}
+
+ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
+{
+ uint total_work_size = kernel_total_work_size(kg);
+ uint num_pools = kernel_num_work_pools(kg);
+
+ if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >=
total_work_size) {
+ return 0;
}
- else /*if(dim == 1)*/ {
- kernel_assert(dim == 1);
- uint y_span = ray_index / tile_dim_x;
- return y_span / ccl_local_size(1);
+
+ uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) *
WORK_POOL_SIZE;
+
+ uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
+ if(work_pool < remainder / WORK_POOL_SIZE) {
+ work_size += WORK_POOL_SIZE;
+ }
+ else if(work_pool == remainder / WORK_POOL_SIZE) {
+ work_size += remainder % WORK_POOL_SIZE;
}
+
+ return work_size;
}
-ccl_device uint get_total_work(KernelGlobals *kg,
- uint tile_dim_x,
- uint tile_dim_y,
- uint grp_idx,
- uint grp_idy,
- uint num_samples)
+ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint
work_index, uint ray_index)
{
- uint threads_within_tile_border_x =
- (grp_idx == (ccl_num_groups(0) - 1)) ? tile_dim_x %
ccl_local_size(0)
- : ccl_local_size(0);
- uint threads_within_tile_border_y =
- (grp_idy == (ccl_num_groups(1) - 1)) ? tile_dim_y %
ccl_local_size(1)
- : ccl_local_size(1);
-
- threads_within_tile_border_x =
- (threads_within_tile_border_x == 0) ? ccl_local_size(0)
- :
threads_within_tile_border_x;
- threads_within_tile_border_y =
- (threads_within_tile_border_y == 0) ? ccl_local_size(1)
- :
threads_within_tile_border_y;
-
- return threads_within_tile_border_x *
- threads_within_tile_border_y *
- num_samples;
+ uint num_pools = kernel_num_work_pools(kg);
+ uint pool = work_pool_from_ray_index(kg, ray_index);
+
+ return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
+ + (pool * WORK_POOL_SIZE)
+ + (work_index % WORK_POOL_SIZE);
}
-/* Returns 0 in case there is no next work available */
-/* Returns 1 in case work assigned is valid */
-ccl_device int get_next_work(KernelGlobals *kg,
- ccl_global uint *work_pool,
- ccl_private uint *my_work,
- uint tile_dim_x,
- uint tile_dim_y,
- uint num_samples,
- uint ray_index)
+/* Returns true if there is work */
+ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index,
uint ray_index)
{
- uint grp_idx = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- 0);
- uint grp_idy = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- 1);
- uint total_work = get_total_work(kg,
- tile_dim_x,
- tile_dim_y,
- grp_idx,
- grp_idy,
- num_samples);
- uint group_index = grp_idy * ccl_num_groups(0) + grp_idx;
- *my_work = atomic_fetch_and_inc_uint32(&work_pool[group_index]);
- return (*my_work < total_work) ? 1 : 0;
+ uint work_pool = work_pool_from_ray_index(kg, ray_index);
+ uint pool_size = work_pool_work_size(kg, work_pool);
+
+ if(pool_size == 0) {
+ return false;
+ }
+
+ *work_index =
atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
+ return (*work_index < pool_size);
}
-/* This function assumes that the passed my_work is valid. */
-/* Decode sample number w.r.t. assigned my_work. */
-ccl_device uint get_my_sample(KernelGlobals *kg,
- uint my_work,
- uint tile_dim_x,
- uint tile_dim_y,
- uint ray_index)
+/* This function assumes that the passed `work` is valid. */
+/* Decode sample number w.r.t. assigned `work`. */
+ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint
ray_index)
{
- uint grp_idx = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- 0);
- uint grp_idy = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- 1);
- uint threads_within_tile_border_x =
- (grp_idx == (ccl_num_groups(0) - 1)) ? tile_dim_x %
ccl_local_size(0)
- : ccl_local_size(0);
- uint threads_within_tile_border_y =
- (grp_idy == (ccl_num_groups(1) - 1)) ? tile_dim_y %
ccl_local_size(1)
- : ccl_local_size(1);
-
- threads_within_tile_border_x =
- (threads_within_tile_border_x == 0) ? ccl_local_size(0)
- :
threads_within_tile_border_x;
- threads_within_tile_border_y =
- (threads_within_tile_border_y == 0) ? ccl_local_size(1)
- :
threads_within_tile_border_y;
-
- return my_work /
- (threads_within_tile_border_x * threads_within_tile_border_y);
+ return get_global_work_index(kg, work_index, ray_index) /
(kernel_split_params.w * kernel_split_params.h);
}
-/* Decode pixel and tile position w.r.t. assigned my_work. */
-ccl_device void get_pixel_tile_position(KernelGlobals *kg,
+/* Decode pixel and tile position w.r.t. assigned `work`. */
+ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
ccl_private uint *pixel_x,
ccl_private uint *pixel_y,
ccl_private uint *tile_x,
ccl_private uint *tile_y,
- uint my_work,
- uint tile_dim_x,
- uint tile_dim_y,
- uint tile_offset_x,
- uint tile_offset_y,
+ uint work_index,
uint ray_index)
{
- uint grp_idx = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- 0);
- uint grp_idy = get_g
@@ Diff output truncated at 10240 characters. @@
_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs