Commit: 148795bf0128a1e78f11d8683f2454d3c656d72d Author: Mai Lavelle Date: Wed Feb 22 08:10:02 2017 -0500 Branches: temp_cycles_split_kernel https://developer.blender.org/rB148795bf0128a1e78f11d8683f2454d3c656d72d
Cycles: OpenCL split kernel refactor This does a few things at once: - Refactors host-side split kernel logic into a new device-agnostic class `DeviceSplitKernel`. - Removes tile splitting; a new work pool implementation takes its place and allows as many threads as will fit in memory regardless of tile size, which can give performance gains. - Refactors split state buffers into one buffer and reduces the number of arguments passed to kernels. This means there's less code to deal with overall. - Moves kernel logic out of OpenCL kernel files so it can later be used by other device types. - Replaces OpenCL-specific APIs with new generic versions. - Tiles can now be seen updating during rendering. =================================================================== M intern/cycles/device/CMakeLists.txt A intern/cycles/device/device_split_kernel.cpp A intern/cycles/device/device_split_kernel.h M intern/cycles/device/opencl/opencl.h M intern/cycles/device/opencl/opencl_base.cpp M intern/cycles/device/opencl/opencl_mega.cpp M intern/cycles/device/opencl/opencl_split.cpp M intern/cycles/kernel/CMakeLists.txt M intern/cycles/kernel/kernel_compat_opencl.h M intern/cycles/kernel/kernel_globals.h M intern/cycles/kernel/kernel_passes.h M intern/cycles/kernel/kernel_queues.h M intern/cycles/kernel/kernel_shadow.h M intern/cycles/kernel/kernel_types.h M intern/cycles/kernel/kernel_work_stealing.h M intern/cycles/kernel/kernels/opencl/kernel.cl M intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl M intern/cycles/kernel/kernels/opencl/kernel_data_init.cl M intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl M intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl M intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl M intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl M intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl M 
intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl M intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl M intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl M intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl M intern/cycles/kernel/split/kernel_background_buffer_update.h M intern/cycles/kernel/split/kernel_data_init.h M intern/cycles/kernel/split/kernel_direct_lighting.h M intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h M intern/cycles/kernel/split/kernel_lamp_emission.h M intern/cycles/kernel/split/kernel_next_iteration_setup.h A intern/cycles/kernel/split/kernel_queue_enqueue.h M intern/cycles/kernel/split/kernel_scene_intersect.h M intern/cycles/kernel/split/kernel_shader_eval.h M intern/cycles/kernel/split/kernel_shadow_blocked.h M intern/cycles/kernel/split/kernel_split_common.h A intern/cycles/kernel/split/kernel_split_data.h M intern/cycles/kernel/split/kernel_sum_all_radiance.h M intern/cycles/util/util_types.h =================================================================== diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 966ff5e52b..a237345169 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -3,6 +3,7 @@ set(INC . 
../graph ../kernel + ../kernel/split ../kernel/svm ../kernel/osl ../util @@ -33,6 +34,7 @@ set(SRC device_cuda.cpp device_multi.cpp device_opencl.cpp + device_split_kernel.cpp device_task.cpp ) @@ -56,6 +58,7 @@ set(SRC_HEADERS device_memory.h device_intern.h device_network.h + device_split_kernel.h device_task.h ) diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp new file mode 100644 index 0000000000..cf43e499d0 --- /dev/null +++ b/intern/cycles/device/device_split_kernel.cpp @@ -0,0 +1,283 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device_split_kernel.h" + +#include "kernel_types.h" +#include "kernel_split_data.h" + +#include "util_time.h" + +CCL_NAMESPACE_BEGIN + +static const double alpha = 0.1; /* alpha for rolling average */ + +DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device) +{ + current_max_closure = -1; + first_tile = true; + + avg_time_per_sample = 0.0; +} + +DeviceSplitKernel::~DeviceSplitKernel() +{ + device->mem_free(split_data); + device->mem_free(ray_state); + device->mem_free(use_queues_flag); + device->mem_free(queue_index); + device->mem_free(work_pool_wgs); + + delete kernel_scene_intersect; + delete kernel_lamp_emission; + delete kernel_queue_enqueue; + delete kernel_background_buffer_update; + delete kernel_shader_eval; + delete kernel_holdout_emission_blurring_pathtermination_ao; + delete kernel_direct_lighting; + delete kernel_shadow_blocked; + delete kernel_next_iteration_setup; + delete kernel_sum_all_radiance; +} + +bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features) +{ +#define LOAD_KERNEL(name) \ + kernel_##name = get_split_kernel_function(#name, requested_features); \ + if(!kernel_##name) { \ + return false; \ + } + + LOAD_KERNEL(scene_intersect); + LOAD_KERNEL(lamp_emission); + LOAD_KERNEL(queue_enqueue); + LOAD_KERNEL(background_buffer_update); + LOAD_KERNEL(shader_eval); + LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); + LOAD_KERNEL(direct_lighting); + LOAD_KERNEL(shadow_blocked); + LOAD_KERNEL(next_iteration_setup); + LOAD_KERNEL(sum_all_radiance); + +#undef LOAD_KERNEL + + current_max_closure = requested_features.max_closure; + + return true; +} + +size_t DeviceSplitKernel::max_elements_for_max_buffer_size(size_t max_buffer_size, size_t passes_size) +{ + size_t size_per_element = split_data_buffer_size(1024, current_max_closure, passes_size) / 1024; + return max_buffer_size / size_per_element; +} + +bool DeviceSplitKernel::path_trace(DeviceTask *task, + RenderTile& tile, + 
device_memory& kgbuffer, + device_memory& kernel_data) +{ + if(device->have_error()) { + return false; + } + + /* Get local size */ + size_t local_size[2]; + { + int2 lsize = split_kernel_local_size(); + local_size[0] = lsize[0]; + local_size[1] = lsize[1]; + } + + /* Calculate per_thread_output_buffer_size. */ + size_t per_thread_output_buffer_size = task->passes_size; + + /* Set gloabl size */ + size_t global_size[2]; + { + int2 gsize = split_kernel_global_size(task); + + /* Make sure that set work size is a multiple of local + * work size dimensions. + */ + global_size[0] = round_up(gsize[0], local_size[0]); + global_size[1] = round_up(gsize[1], local_size[1]); + } + + /* Number of elements in the global state buffer */ + int num_global_elements = global_size[0] * global_size[1]; + + /* Allocate all required global memory once. */ + if(first_tile) { + first_tile = false; + + /* Calculate max groups */ + + /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ + unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1; + + /* Allocate work_pool_wgs memory. 
*/ + work_pool_wgs.resize(max_work_groups * sizeof(unsigned int)); + device->mem_alloc(work_pool_wgs, MEM_READ_WRITE); + + queue_index.resize(NUM_QUEUES * sizeof(int)); + device->mem_alloc(queue_index, MEM_READ_WRITE); + + use_queues_flag.resize(sizeof(char)); + device->mem_alloc(use_queues_flag, MEM_READ_WRITE); + + ray_state.resize(num_global_elements); + device->mem_alloc(ray_state, MEM_READ_WRITE); + + split_data.resize(split_data_buffer_size(num_global_elements, + current_max_closure, + per_thread_output_buffer_size)); + device->mem_alloc(split_data, MEM_READ_WRITE); + } + +#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ + if(device->have_error()) { \ + return false; \ + } \ + if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ + return false; \ + } + + tile.sample = tile.start_sample; + + /* for exponential increase between tile updates */ + int time_multiplier = 1; + + while(tile.sample < tile.start_sample + tile.num_samples) { + /* to keep track of how long it takes to run a number of samples */ + double start_time = time_dt(); + + /* initial guess to start rolling average */ + const int initial_num_samples = 1; + /* approx number of samples per second */ + int samples_per_second = (avg_time_per_sample > 0.0) ? 
+ int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples; + + RenderTile subtile = tile; + subtile.start_sample = tile.sample; + subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample); + + if(device->have_error()) { + return false; + } + + /* reset state memory here as global size for data_init + * kernel might not be large enough to do in kernel + */ + device->mem_zero(work_pool_wgs); + device->mem_zero(split_data); + + if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), + subtile, + num_global_elements, + kgbuffer, + kernel_data, + split_data, + ray_state, + queue_index, + use_queues_flag, + work_pool_wgs + )) + { + return false; + } + + bool activeRaysAvailable = true; + + while(activeRaysAvailable) { + /* Twice the global work size of other kernels for + * ckPathTraceKernel_shadow_blocked_direct_lighting. */ + size_t global_size_shadow_blocked[2]; + global_size_shadow_blocked[0] = global_size[0] * 2; + global_size_shadow_blocked[1] = global_size[1]; + + /* Do path-iteration in host [Enqueue Path-iteration kernels. */ + for(int PathIter = 0; PathIter < 16; PathIter++) { + ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size); + ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); + + if(task->get_cancel()) { + return true; + } + } + + /* Decide if we should exit path-iteration in host. 
*/ + device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1); + + activeRaysAvailable = false; + + for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { + if(int8_t(ray_state.get_data()[rayStateIter]) != RAY_INACTIVE) { + /* Not all rays are RAY_INACTIVE. */ + activeRaysAvailable = true; + break; + } + } + + if(task->get_cancel()) { + return true; + } + } + + double time_per_sample = ((time_dt()-start_time) / subtile.num_samples); + + if(avg_time_per_sample == 0.0) { + /* start rolling average */ + avg_time_per_sample = time_per_sample; + } + else { + avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample; + } + + size_t sum_all_radiance_local_size[2] = {16, 16}; + size_t sum_all_radiance_global_size[2]; + sum_all_radiance_global_size[0] = round_up(tile.w, sum_all_radiance_local_size[0]); + sum_all_radiance_global_size[1] = round_up(tile.h, sum_all_radiance_local_size[1]); + + ENQUEUE_SPLIT_KERNEL(sum_all_radiance, + sum_all_radiance_global_size, + sum_all_radiance_local_size); + +#undef ENQUEUE_SPLIT_KERNEL + + tile.sample += subtile.num_samples; + task->update_progress(&tile, tile.w*tile.h*subtile.num_samples); + + time_multiplier = min(time_multiplier << 1, 10); + + if(task->get_cancel()) { + return true; + } + } + + return true; +} + +CCL_NAMESPACE_END + + diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h new file mode 100644 index 0000000000..b3106fd563 --- /dev/null +++ b/intern/cycles/device/device_split_ke @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org https://lists.blender.org/mailman/listinfo/bf-blender-cvs