[Bf-blender-cvs] [d8c7cec415] cycles_split_kernel: Cycles: Rework work stealing

Mai Lavelle Tue, 24 Jan 2017 04:34:11 -0800

Commit: d8c7cec415da34bc9e1687527263dae92414270f
Author: Mai Lavelle
Date:   Tue Jan 24 05:51:26 2017 -0500
Branches: cycles_split_kernel
https://developer.blender.org/rBd8c7cec415da34bc9e1687527263dae92414270f


Cycles: Rework work stealing

The previous implementation of work stealing didn't do what it claimed to
do and made some odd assumptions. This new implementation is cleaner,
organizes work slightly more efficiently, and, most importantly,
decouples work from tile size, which will allow for greater speedups.

===================================================================

M       intern/cycles/device/device_split_kernel.cpp
M       intern/cycles/kernel/kernel_types.h
M       intern/cycles/kernel/kernel_work_stealing.h
M       intern/cycles/kernel/split/kernel_background_buffer_update.h
M       intern/cycles/kernel/split/kernel_data_init.h
M       intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
M       intern/cycles/kernel/split/kernel_lamp_emission.h
M       intern/cycles/kernel/split/kernel_scene_intersect.h
M       intern/cycles/kernel/split/kernel_split_data.h
M       intern/cycles/kernel/split/kernel_sum_all_radiance.h

===================================================================

diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 484416d297..1954c3ccaa 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -115,9 +115,9 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
        /* Make sure that set render feasible tile size is a multiple of local
         * work size dimensions.
         */
-       int2 max_render_feasible_tile_size;
-       max_render_feasible_tile_size.x = round_up(task->requested_tile_size.x, 
local_size[0]);
-       max_render_feasible_tile_size.y = round_up(task->requested_tile_size.y, 
local_size[1]);
+       size_t global_size[2];
+       global_size[0] = round_up(task->requested_tile_size.x, local_size[0]);
+       global_size[1] = round_up(task->requested_tile_size.y, local_size[1]);
 
        /* Calculate per_thread_output_buffer_size. */
        size_t per_thread_output_buffer_size;
@@ -143,12 +143,8 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
                }
        }
 
-       /* set global_size */
-       size_t global_size[2] = {round_up(tile.w, local_size[0]), 
round_up(tile.h, local_size[1])};
-       assert(global_size[0] * global_size[1] <= 
max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
-
        /* Number of elements in the global state buffer */
-       int num_global_elements = max_render_feasible_tile_size.x * 
max_render_feasible_tile_size.y;
+       int num_global_elements = global_size[0] * global_size[1];
 
        /* Allocate all required global memory once. */
        if(first_tile) {
@@ -157,8 +153,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
                /* Calculate max groups */
 
                /* Denotes the maximum work groups possible w.r.t. current 
requested tile size. */
-               unsigned int max_work_groups = (max_render_feasible_tile_size.x 
* max_render_feasible_tile_size.y) /
-                                 (local_size[0] * local_size[1]);
+               unsigned int max_work_groups = num_global_elements / 
WORK_POOL_SIZE + 1;
 
                /* Allocate work_pool_wgs memory. */
                work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index e3806f207f..d442de0b56 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -61,6 +61,8 @@ CCL_NAMESPACE_BEGIN
 
 #define VOLUME_STACK_SIZE              16
 
+#define WORK_POOL_SIZE 64
+
 /* device capabilities */
 #ifdef __KERNEL_CPU__
 #  ifdef __KERNEL_SSE2__
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 8065d2dd91..28fc5ce1c3 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -27,160 +27,90 @@ CCL_NAMESPACE_BEGIN
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif
 
-ccl_device uint get_group_id_with_ray_index(uint ray_index,
-                                 uint tile_dim_x,
-                                 uint tile_dim_y,
-                                 int dim)
+ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
 {
-       if(dim == 0) {
-               uint x_span = ray_index % tile_dim_x;
-               return x_span / ccl_local_size(0);
+       return kernel_split_params.w * kernel_split_params.h * 
kernel_split_params.num_samples;
+}
+
+ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
+{
+       return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
+}
+
+ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint 
ray_index)
+{
+       return ray_index / WORK_POOL_SIZE;
+}
+
+ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
+{
+       uint total_work_size = kernel_total_work_size(kg);
+       uint num_pools = kernel_num_work_pools(kg);
+
+       if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= 
total_work_size) {
+               return 0;
        }
-       else /*if(dim == 1)*/ {
-               kernel_assert(dim == 1);
-               uint y_span = ray_index / tile_dim_x;
-               return y_span / ccl_local_size(1);
+
+       uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * 
WORK_POOL_SIZE;
+
+       uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
+       if(work_pool < remainder / WORK_POOL_SIZE) {
+               work_size += WORK_POOL_SIZE;
+       }
+       else if(work_pool == remainder / WORK_POOL_SIZE) {
+               work_size += remainder % WORK_POOL_SIZE;
        }
+
+       return work_size;
 }
 
-ccl_device uint get_total_work(KernelGlobals *kg,
-                    uint tile_dim_x,
-                    uint tile_dim_y,
-                    uint grp_idx,
-                    uint grp_idy,
-                    uint num_samples)
+ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint 
work_index, uint ray_index)
 {
-       uint threads_within_tile_border_x =
-               (grp_idx == (ccl_num_groups(0) - 1)) ? tile_dim_x % 
ccl_local_size(0)
-                                                    : ccl_local_size(0);
-       uint threads_within_tile_border_y =
-               (grp_idy == (ccl_num_groups(1) - 1)) ? tile_dim_y % 
ccl_local_size(1)
-                                                    : ccl_local_size(1);
-
-       threads_within_tile_border_x =
-               (threads_within_tile_border_x == 0) ? ccl_local_size(0)
-                                                   : 
threads_within_tile_border_x;
-       threads_within_tile_border_y =
-               (threads_within_tile_border_y == 0) ? ccl_local_size(1)
-                                                   : 
threads_within_tile_border_y;
-
-       return threads_within_tile_border_x *
-              threads_within_tile_border_y *
-              num_samples;
+       uint num_pools = kernel_num_work_pools(kg);
+       uint pool = work_pool_from_ray_index(kg, ray_index);
+
+       return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
+              + (pool * WORK_POOL_SIZE)
+              + (work_index % WORK_POOL_SIZE);
 }
 
-/* Returns 0 in case there is no next work available */
-/* Returns 1 in case work assigned is valid */
-ccl_device int get_next_work(KernelGlobals *kg,
-                  ccl_global uint *work_pool,
-                  ccl_private uint *my_work,
-                  uint tile_dim_x,
-                  uint tile_dim_y,
-                  uint num_samples,
-                  uint ray_index)
+/* Returns true if there is work */
+ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, 
uint ray_index)
 {
-       uint grp_idx = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  0);
-       uint grp_idy = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  1);
-       uint total_work = get_total_work(kg,
-                                        tile_dim_x,
-                                        tile_dim_y,
-                                        grp_idx,
-                                        grp_idy,
-                                        num_samples);
-       uint group_index = grp_idy * ccl_num_groups(0) + grp_idx;
-       *my_work = atomic_fetch_and_inc_uint32(&work_pool[group_index]);
-       return (*my_work < total_work) ? 1 : 0;
+       uint work_pool = work_pool_from_ray_index(kg, ray_index);
+       uint pool_size = work_pool_work_size(kg, work_pool);
+
+       if(pool_size == 0) {
+               return false;
+       }
+
+       *work_index = 
atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
+       return (*work_index < pool_size);
 }
 
-/* This function assumes that the passed my_work is valid. */
-/* Decode sample number w.r.t. assigned my_work. */
-ccl_device uint get_my_sample(KernelGlobals *kg,
-                   uint my_work,
-                   uint tile_dim_x,
-                   uint tile_dim_y,
-                   uint ray_index)
+/* This function assumes that the passed `work` is valid. */
+/* Decode sample number w.r.t. assigned `work`. */
+ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint 
ray_index)
 {
-       uint grp_idx = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  0);
-       uint grp_idy = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  1);
-       uint threads_within_tile_border_x =
-               (grp_idx == (ccl_num_groups(0) - 1)) ? tile_dim_x % 
ccl_local_size(0)
-                                                    : ccl_local_size(0);
-       uint threads_within_tile_border_y =
-               (grp_idy == (ccl_num_groups(1) - 1)) ? tile_dim_y % 
ccl_local_size(1)
-                                                    : ccl_local_size(1);
-
-       threads_within_tile_border_x =
-               (threads_within_tile_border_x == 0) ? ccl_local_size(0)
-                                                   : 
threads_within_tile_border_x;
-       threads_within_tile_border_y =
-               (threads_within_tile_border_y == 0) ? ccl_local_size(1)
-                                                   : 
threads_within_tile_border_y;
-
-       return my_work /
-              (threads_within_tile_border_x * threads_within_tile_border_y);
+       return get_global_work_index(kg, work_index, ray_index) / 
(kernel_split_params.w * kernel_split_params.h);
 }
 
-/* Decode pixel and tile position w.r.t. assigned my_work. */
-ccl_device void get_pixel_tile_position(KernelGlobals *kg,
+/* Decode pixel and tile position w.r.t. assigned `work`. */
+ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
                              ccl_private uint *pixel_x,
                              ccl_private uint *pixel_y,
                              ccl_private uint *tile_x,
                              ccl_private uint *tile_y,
-                             uint my_work,
-                             uint tile_dim_x,
-                             uint tile_dim_y,
-                             uint tile_offset_x,
-                             uint tile_offset_y,
+                             uint work_index,
                              uint ray_index)
 {
-       uint grp_idx = get_group_id_with_ray_index(ray_index,
-                                                  tile_dim_x,
-                                                  tile_dim_y,
-                                                  0);
-       uint grp_idy = get_g

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

[Bf-blender-cvs] [d8c7cec415] cycles_split_kernel: Cycles: Rework work stealing

Reply via email to