Commit: dde40989f34634f43fb561416728c438dfb62f0b Author: Sergey Sharybin Date: Wed Sep 21 17:46:25 2016 +0200 Branches: master https://developer.blender.org/rBdde40989f34634f43fb561416728c438dfb62f0b
Cycles: Store shadow intersections in the kernel globals Seems CUDA failed to de-duplicate the array across multiple inlined versions of the shadow_blocked(). Helped it a bit with that now. Gives about 100MB memory improvement on a scenes after previous commit and brings up memory "regression" to only 100MB comparing to the master branch now. =================================================================== M intern/cycles/kernel/kernel_globals.h M intern/cycles/kernel/kernel_shadow.h M intern/cycles/kernel/kernels/cuda/kernel.cu =================================================================== diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 8e66a3a034..2b52a2d2f4 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -76,7 +76,10 @@ typedef struct KernelGlobals { #ifdef __KERNEL_CUDA__ __constant__ KernelData __data; -typedef struct KernelGlobals {} KernelGlobals; +typedef struct KernelGlobals { + /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */ + Intersection hits_stack[64]; +} KernelGlobals; # ifdef __KERNEL_CUDA_TEX_STORAGE__ # define KERNEL_TEX(type, ttype, name) ttype name; diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index 05a6c7d182..e69eac6ab8 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -109,8 +109,12 @@ ccl_device_inline bool shadow_blocked_all(KernelGlobals *kg, /* Intersect to find an opaque surface, or record all transparent * surface hits. */ +#ifdef __KERNEL_CUDA__ + Intersection *hits = kg->hits_stack; +#else Intersection hits_stack[SHADOW_STACK_MAX_HITS]; Intersection *hits = hits_stack; +#endif const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce; uint max_hits = transparent_max_bounce - state->transparent_bounce - 1; #ifndef __KERNEL_GPU__ @@ -247,6 +251,7 @@ ccl_device_noinline bool shadow_blocked_stepped(KernelGlobals *kg, for(;;) { if(bounce >= kernel_data.integrator.transparent_max_bounce) { return true; + } if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index eb2b6ea541..090ab2c50c 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -130,8 +130,10 @@ kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int s int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - if(x < sx + sw && y < sy + sh) - kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride); + if(x < sx + sw && y < sy + sh) { + KernelGlobals kg; + kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride); + } } #ifdef __BRANCHED_PATH__ @@ -142,8 +144,10 @@ kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - if(x < sx + sw && y < sy + sh) - kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride); + if(x < sx + sw && y < sy + sh) { + KernelGlobals kg; + kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride); + } } #endif @@ -154,8 +158,9 @@ kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - if(x < sx + sw && y < sy + sh) + if(x < sx + sw && y < sy + sh) { kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); + } } extern "C" __global__ void @@ -165,8 +170,9 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal int x = sx + blockDim.x*blockIdx.x + threadIdx.x; int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - if(x < sx + sw && y < sy + sh) + if(x < sx + sw && y < sy + sh) { kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); + } } extern "C" __global__ void @@ -183,7 +189,8 @@ kernel_cuda_shader(uint4 *input, int x = sx + blockDim.x*blockIdx.x + threadIdx.x; if(x < sx + sw) { - kernel_shader_evaluate(NULL, + KernelGlobals kg; + kernel_shader_evaluate(&kg, input, output, output_luma, @@ -200,8 +207,10 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - if(x < sx + sw) - kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, x, offset, sample); + if(x < sx + sw) { + KernelGlobals kg; + kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample); + } } #endif _______________________________________________ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org https://lists.blender.org/mailman/listinfo/bf-blender-cvs