Commit: 8a72be7697f8fbfc8cb6cc9f3df049104e41d4a6 Author: Brecht Van Lommel Date: Wed Nov 1 21:02:28 2017 +0100 Branches: master https://developer.blender.org/rB8a72be7697f8fbfc8cb6cc9f3df049104e41d4a6
Cycles: reduce closure memory usage for emission/shadow shader data. With a Titan Xp, reduces path trace local memory from 1092MB to 840MB. Benchmark performance was within 1% with both RX 480 and Titan Xp. Original patch was implemented by Sergey. Differential Revision: https://developer.blender.org/D2249 =================================================================== M intern/cycles/kernel/closure/alloc.h M intern/cycles/kernel/kernel_bake.h M intern/cycles/kernel/kernel_emission.h M intern/cycles/kernel/kernel_path.h M intern/cycles/kernel/kernel_path_branched.h M intern/cycles/kernel/kernel_shader.h M intern/cycles/kernel/kernel_shadow.h M intern/cycles/kernel/kernel_subsurface.h M intern/cycles/kernel/kernel_types.h M intern/cycles/kernel/kernel_volume.h M intern/cycles/kernel/split/kernel_buffer_update.h M intern/cycles/kernel/split/kernel_direct_lighting.h M intern/cycles/kernel/split/kernel_do_volume.h M intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h M intern/cycles/kernel/split/kernel_path_init.h M intern/cycles/kernel/split/kernel_shader_eval.h M intern/cycles/kernel/split/kernel_shadow_blocked_ao.h M intern/cycles/kernel/split/kernel_shadow_blocked_dl.h M intern/cycles/kernel/split/kernel_split_data_types.h M intern/cycles/kernel/split/kernel_subsurface_scatter.h =================================================================== diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h index e799855a65e..48a60405b5a 100644 --- a/intern/cycles/kernel/closure/alloc.h +++ b/intern/cycles/kernel/closure/alloc.h @@ -20,17 +20,16 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty { kernel_assert(size <= sizeof(ShaderClosure)); - int num_closure = sd->num_closure; - int num_closure_extra = sd->num_closure_extra; - if(num_closure + num_closure_extra >= MAX_CLOSURE) + if(sd->num_closure_left == 0) return NULL; - ShaderClosure *sc = &sd->closure[num_closure]; + 
ShaderClosure *sc = &sd->closure[sd->num_closure]; sc->type = type; sc->weight = weight; sd->num_closure++; + sd->num_closure_left--; return sc; } @@ -44,18 +43,16 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size) * This lets us keep the same fast array iteration over closures, as we * found linked list iteration and iteration with skipping to be slower. */ int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure)); - int num_closure = sd->num_closure; - int num_closure_extra = sd->num_closure_extra + num_extra; - if(num_closure + num_closure_extra > MAX_CLOSURE) { + if(num_extra > sd->num_closure_left) { /* Remove previous closure. */ sd->num_closure--; - sd->num_closure_extra++; + sd->num_closure_left++; return NULL; } - sd->num_closure_extra = num_closure_extra; - return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra); + sd->num_closure_left -= num_extra; + return (ccl_addr_space void*)(sd->closure + sd->num_closure + sd->num_closure_left); } ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight) diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 84d8d84d486..9ce10358b81 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -51,7 +51,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL); /* evaluate surface shader */ - shader_eval_surface(kg, sd, &state, state.flag); + shader_eval_surface(kg, sd, &state, state.flag, MAX_CLOSURE); /* TODO, disable more closures we don't need besides transparent */ shader_bsdf_disable_transparency(kg, sd); @@ -239,12 +239,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, } else { /* surface color of the pass only */ - shader_eval_surface(kg, sd, state, 0); + shader_eval_surface(kg, sd, state, 0, MAX_CLOSURE); return kernel_bake_shader_bsdf(kg, sd, 
type); } } else { - shader_eval_surface(kg, sd, state, 0); + shader_eval_surface(kg, sd, state, 0, MAX_CLOSURE); color = kernel_bake_shader_bsdf(kg, sd, type); } @@ -337,7 +337,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, { float3 N = sd.N; if((sd.flag & SD_HAS_BUMP)) { - shader_eval_surface(kg, &sd, &state, 0); + shader_eval_surface(kg, &sd, &state, 0, MAX_CLOSURE); N = shader_bsdf_average_normal(kg, &sd); } @@ -352,7 +352,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, } case SHADER_EVAL_EMISSION: { - shader_eval_surface(kg, &sd, &state, 0); + shader_eval_surface(kg, &sd, &state, 0, 0); out = shader_emissive_eval(kg, &sd); break; } diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 45b8c6311e1..94b0a37ce62 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -70,14 +70,11 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, /* no path flag, we're evaluating this for all closures. 
that's weak but * we'd have to do multiple evaluations otherwise */ path_state_modify_bounce(state, true); - shader_eval_surface(kg, emission_sd, state, 0); + shader_eval_surface(kg, emission_sd, state, 0, 0); path_state_modify_bounce(state, false); /* evaluate emissive closure */ - if(emission_sd->flag & SD_EMISSION) - eval = shader_emissive_eval(kg, emission_sd); - else - eval = make_float3(0.0f, 0.0f, 0.0f); + eval = shader_emissive_eval(kg, emission_sd); } eval *= ls->eval_fac; diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 1099064038b..8519e0682e1 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -443,7 +443,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, sd, &isect, ray); - shader_eval_surface(kg, sd, state, state->flag); + shader_eval_surface(kg, sd, state, state->flag, MAX_CLOSURE); shader_prepare_closures(sd, state); /* Apply shadow catcher, holdout, emission. */ @@ -561,7 +561,7 @@ ccl_device_forceinline void kernel_path_integrate( bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L); /* Find intersection with lamps and compute emission for MIS. */ - kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L); + kernel_path_lamp_emission(kg, state, ray, throughput, &isect, &sd, L); #ifdef __VOLUME__ /* Volume integration. */ @@ -585,7 +585,7 @@ ccl_device_forceinline void kernel_path_integrate( /* Shade background. */ if(!hit) { - kernel_path_background(kg, state, ray, throughput, emission_sd, L); + kernel_path_background(kg, state, ray, throughput, &sd, L); break; } else if(path_state_ao_bounce(kg, state)) { @@ -594,7 +594,7 @@ ccl_device_forceinline void kernel_path_integrate( /* Setup and evaluate shader. 
*/ shader_setup_from_ray(kg, &sd, &isect, ray); - shader_eval_surface(kg, &sd, state, state->flag); + shader_eval_surface(kg, &sd, state, state->flag, MAX_CLOSURE); shader_prepare_closures(&sd, state); /* Apply shadow catcher, holdout, emission. */ @@ -706,9 +706,11 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, PathRadiance L; path_radiance_init(&L, kernel_data.film.use_light_pass); - ShaderData emission_sd; + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + PathState state; - path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray); + path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray); /* Integrate. */ kernel_path_integrate(kg, @@ -717,7 +719,7 @@ ccl_device void kernel_path_trace(KernelGlobals *kg, &ray, &L, buffer, - &emission_sd); + emission_sd); kernel_write_result(kg, buffer, sample, &L); } diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index 3877e4f0058..f93366eade1 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -436,10 +436,12 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, /* shader data memory used for both volumes and surfaces, saves stack space */ ShaderData sd; /* shader data used by emission, shadows, volume stacks, indirect path */ - ShaderData emission_sd, indirect_sd; + ShaderDataTinyStorage emission_sd_storage; + ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage); + ShaderData indirect_sd; PathState state; - path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray); + path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray); /* Main Loop * Here we only handle transparency intersections from the camera ray. 
@@ -460,7 +462,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, &isect, hit, &indirect_sd, - &emission_sd, + emission_sd, L); #endif /* __VOLUME__ */ @@ -472,7 +474,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, /* Setup and evaluate shader. */ shader_setup_from_ray(kg, &sd, &isect, &ray); - shader_eval_surface(kg, &sd, &state, state.flag); + shader_eval_surface(kg, &sd, &state, state.flag, MAX_CLOSURE); shader_merge_closures(&sd); /* Apply shadow catcher, holdout, emission. */ @@ -481,7 +483,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, &state, &ray, throughput, - &emission_sd, + emission_sd, L, buffer)) { @@ -513,14 +515,14 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, throughput); + kernel_branched_path_ao(kg, &sd, emission_sd, L, &state, throughput); } #endif /* __AO__ */ #ifdef __SUBSURFACE__ /* bssrdf scatter to a different location on the same object */ if(sd.flag & SD_BSSRDF) { - kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd, + kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, emission_sd, L, &state, &ray, throughput); } #endif /* __SUBSURFACE__ */ @@ -534,13 +536,13 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, int all = (kernel_data.integrator.sample_all_lights_direct) || (state.flag & PATH_RAY_SHADOW_CATCHER); kernel_branched_path_surface_connect_light(kg, - &sd, &emission_sd, &hit_state, throughput, 1.0f, L, all); + &sd, emission_sd, &hit_state, throughput, 1.0f, L, all); } #endif /* __EMISSION__ */ /* indirect light */ kernel_branched_path_surface_indirect_light(kg, - &sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L); + &sd, &indirect_sd, emissio @@ Diff output truncated at 10240 characters. 
@@ _______________________________________________ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org https://lists.blender.org/mailman/listinfo/bf-blender-cvs
