Commit: dde40989f34634f43fb561416728c438dfb62f0b
Author: Sergey Sharybin
Date:   Wed Sep 21 17:46:25 2016 +0200
Branches: master
https://developer.blender.org/rBdde40989f34634f43fb561416728c438dfb62f0b

Cycles: Store shadow intersections in the kernel globals

Seems CUDA failed to de-duplicate the array across multiple inlined
versions of the shadow_blocked(). Helped it a bit with that now.

Gives about 100MB memory improvement on a scenes after previous
commit and brings up memory "regression" to only 100MB comparing to
the master branch now.

===================================================================

M       intern/cycles/kernel/kernel_globals.h
M       intern/cycles/kernel/kernel_shadow.h
M       intern/cycles/kernel/kernels/cuda/kernel.cu

===================================================================

diff --git a/intern/cycles/kernel/kernel_globals.h 
b/intern/cycles/kernel/kernel_globals.h
index 8e66a3a034..2b52a2d2f4 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -76,7 +76,10 @@ typedef struct KernelGlobals {
 #ifdef __KERNEL_CUDA__
 
 __constant__ KernelData __data;
-typedef struct KernelGlobals {} KernelGlobals;
+typedef struct KernelGlobals {
+       /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
+       Intersection hits_stack[64];
+} KernelGlobals;
 
 #  ifdef __KERNEL_CUDA_TEX_STORAGE__
 #    define KERNEL_TEX(type, ttype, name) ttype name;
diff --git a/intern/cycles/kernel/kernel_shadow.h 
b/intern/cycles/kernel/kernel_shadow.h
index 05a6c7d182..e69eac6ab8 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -109,8 +109,12 @@ ccl_device_inline bool shadow_blocked_all(KernelGlobals 
*kg,
                /* Intersect to find an opaque surface, or record all 
transparent
                 * surface hits.
                 */
+#ifdef __KERNEL_CUDA__
+               Intersection *hits = kg->hits_stack;
+#else
                Intersection hits_stack[SHADOW_STACK_MAX_HITS];
                Intersection *hits = hits_stack;
+#endif
                const int transparent_max_bounce = 
kernel_data.integrator.transparent_max_bounce;
                uint max_hits = transparent_max_bounce - 
state->transparent_bounce - 1;
 #ifndef __KERNEL_GPU__
@@ -247,6 +251,7 @@ ccl_device_noinline bool 
shadow_blocked_stepped(KernelGlobals *kg,
                        for(;;) {
                                if(bounce >= 
kernel_data.integrator.transparent_max_bounce) {
                                        return true;
+                               }
                                if(!scene_intersect(kg,
                                                    *ray,
                                                    PATH_RAY_SHADOW_TRANSPARENT,
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu 
b/intern/cycles/kernel/kernels/cuda/kernel.cu
index eb2b6ea541..090ab2c50c 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -130,8 +130,10 @@ kernel_cuda_path_trace(float *buffer, uint *rng_state, int 
sample, int sx, int s
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-       if(x < sx + sw && y < sy + sh)
-               kernel_path_trace(NULL, buffer, rng_state, sample, x, y, 
offset, stride);
+       if(x < sx + sw && y < sy + sh) {
+               KernelGlobals kg;
+               kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, 
stride);
+       }
 }
 
 #ifdef __BRANCHED_PATH__
@@ -142,8 +144,10 @@ kernel_cuda_branched_path_trace(float *buffer, uint 
*rng_state, int sample, int
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-       if(x < sx + sw && y < sy + sh)
-               kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, 
y, offset, stride);
+       if(x < sx + sw && y < sy + sh) {
+               KernelGlobals kg;
+               kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, 
y, offset, stride);
+       }
 }
 #endif
 
@@ -154,8 +158,9 @@ kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, 
float sample_scale, int
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-       if(x < sx + sw && y < sy + sh)
+       if(x < sx + sw && y < sy + sh) {
                kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, 
x, y, offset, stride);
+       }
 }
 
 extern "C" __global__ void
@@ -165,8 +170,9 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float 
*buffer, float sample_scal
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
        int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-       if(x < sx + sw && y < sy + sh)
+       if(x < sx + sw && y < sy + sh) {
                kernel_film_convert_to_half_float(NULL, rgba, buffer, 
sample_scale, x, y, offset, stride);
+       }
 }
 
 extern "C" __global__ void
@@ -183,7 +189,8 @@ kernel_cuda_shader(uint4 *input,
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
        if(x < sx + sw) {
-               kernel_shader_evaluate(NULL,
+               KernelGlobals kg;
+               kernel_shader_evaluate(&kg,
                                       input,
                                       output,
                                       output_luma,
@@ -200,8 +207,10 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, 
int filter, int sx, int
 {
        int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
-       if(x < sx + sw)
-               kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, 
filter, x, offset, sample);
+       if(x < sx + sw) {
+               KernelGlobals kg;
+               kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, 
filter, x, offset, sample);
+       }
 }
 #endif

_______________________________________________
Bf-blender-cvs mailing list
Bf-blender-cvs@blender.org
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to