[Bf-blender-cvs] [584bbd7] soc-2016-cycles_denoising: Cycles: Get rid of one loop over all denoised pixels by using backsubstitution instead of explicit inversion

Lukas Stockner Mon, 21 Nov 2016 19:26:28 -0800

Commit: 584bbd7e1e5cdb8271f0fd972524e4f59f34976e
Author: Lukas Stockner
Date:   Tue Nov 22 03:16:47 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB584bbd7e1e5cdb8271f0fd972524e4f59f34976e


Cycles: Get rid of one loop over all denoised pixels by using backsubstitution 
instead of explicit inversion

===================================================================

M       intern/cycles/kernel/filter/filter_final_pass_impl.h

===================================================================

diff --git a/intern/cycles/kernel/filter/filter_final_pass_impl.h 
b/intern/cycles/kernel/filter/filter_final_pass_impl.h
index 9e2b9b1..4276867 100644
--- a/intern/cycles/kernel/filter/filter_final_pass_impl.h
+++ b/intern/cycles/kernel/filter/filter_final_pass_impl.h
@@ -82,8 +82,10 @@ ccl_device void FUNCTION_NAME(KernelGlobals *kg, int sample, 
float ccl_readonly_
         * S = inv(Xt*W*X)*Xt*W*y */
 
        float XtWX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], 
design_row[DENOISE_FEATURES+1];
+       float3 solution[(DENOISE_FEATURES+1)];
 
        math_matrix_zero(XtWX, matrix_size);
+       math_vec3_zero(solution, matrix_size);
        /* Construct Xt*W*X matrix (and fill weight cache, if used). */
        FOR_PIXEL_WINDOW {
                float3 color = filter_get_pixel_color(pixel_buffer + 
color_passes.x, pass_stride);
@@ -118,82 +120,20 @@ ccl_device void FUNCTION_NAME(KernelGlobals *kg, int 
sample, float ccl_readonly_
                weight_cache[cache_idx] = weight;
 
                math_add_gramian(XtWX, matrix_size, design_row, weight);
+               math_add_vec3(solution, matrix_size, design_row, weight * 
color);
        } END_FOR_PIXEL_WINDOW
 
-       /* Invert Xt*W*X. Since it is symmetric and semi-positive-definite by 
construction, this can be done fairly efficiently.
-        * Step 1: Apply cholesky decomposition: Find L so that L*Lt = Xt*W*X */
+       /* Solve S = inv(Xt*W*X)*Xt*W*y.
+        * Instead of explicitly inverting Xt*W*X, we rearrange to:
+        * (Xt*W*X)*S = Wt*W*y
+        * Xt*W*X is per definition symmetric positive-semidefinite, so we can 
apply Cholesky decomposition to find a lower triangular L so that L*Lt = Xt*W*X.
+        * With, that we get (L*Lt)*S = L*(Lt*S) = L*b = Wt*W*y.
+        * Since L is lower triangular, finding b (=Lt*S) is relatively easy.
+        * Then, the remaining problem is Lt*S = b, which also can be solved 
easily. */
        math_matrix_add_diagonal(XtWX, matrix_size, 1e-4f); /* Improve the 
numerical stability. */
-       math_cholesky(XtWX, matrix_size);
-       /* Step 2: Invert L in-place, replacing it with inv(L)t. */
-       math_inverse_lower_tri_inplace(XtWX, matrix_size);
-       for(int row = 0; row < matrix_size; row++)
-               for(int col = 0; col < row; col++)
-                       XtWX[row*matrix_size+col] = 0.0f;
-
-       /* Step 3: Find inv(Xt*W*X) = inv(L*Lt) = inv(Lt)*inv(L) = 
inv(L)t*inv(L).
-        * If the gradient of the solution isn't used, only the first row is 
filled. */
-       int solution_size = kernel_data.integrator.use_gradients? matrix_size : 
1;
-       float XtWXinv[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)];
-       if(kernel_data.integrator.use_gradients)
-               math_matrix_zero(XtWXinv, matrix_size);
-       else
-               math_vector_zero(XtWXinv, matrix_size);
-       for(int i = 0; i < solution_size; i++)
-               for(int col = 0; col < matrix_size; col++)
-                       for(int row = 0; row < matrix_size; row++)
-                               XtWXinv[i*matrix_size+col] += 
XtWX[i*matrix_size+row]*XtWX[col*matrix_size+row];
-
-       /* Compute the full solution vector (if the gradient is used) or its 
first element (otherwise). */
-       float3 solution[DENOISE_FEATURES+1];
-       math_vec3_zero(solution, solution_size);
-       FOR_PIXEL_WINDOW {
-               float weight;
-               float3 color;
-               /* If we're using CPU weight caching, the weight is always 
cached - read it, check early-out and fill design row.
-                * If we're using GPU weight caching, the weight might be 
cached - if yes, same as above, otherwise recalculate it.
-                * If we're not using weight caching, always recalculate it. */
-#if defined(WEIGHTING_CACHING_CPU) || defined(WEIGHTING_CACHING_CUDA)
-#  ifdef WEIGHTING_CACHING_CUDA
-               if(cache_idx < CUDA_WEIGHT_CACHE_SIZE)
-#  endif
-               {
-                       weight = weight_cache[cache_idx];
-                       if(weight == 0.0f) continue;
-                       color = filter_get_pixel_color(pixel_buffer + 
color_passes.x, pass_stride);
-#  ifdef WEIGHTING_NFOR
-                       filter_get_design_row(pixel, pixel_buffer, 
feature_means, feature_scales, pass_stride, design_row);
-#  else
-                       filter_get_design_row_transform(pixel, pixel_buffer, 
feature_means, pass_stride, features, rank, design_row, transform, 
transform_stride);
-#  endif
-               }
-#  ifdef WEIGHTING_CACHING_CUDA
-               else
-#  endif
-#endif
-#ifndef WEIGHTING_CACHING_CPU
-               {
-                       color = filter_get_pixel_color(pixel_buffer + 
color_passes.x, pass_stride);
-                       float variance = filter_get_pixel_variance(pixel_buffer 
+ color_passes.x, pass_stride);
-                       if(filter_firefly_rejection(color, variance, 
center_color, sqrt_center_variance)) continue;
-#  ifdef WEIGHTING_WLR
-                       weight = filter_get_design_row_transform_weight(pixel, 
pixel_buffer, feature_means, pass_stride, features, rank, design_row, 
transform, transform_stride, bandwidth_factor);
-#  elif defined(WEIGHTING_NLM)
-                       filter_get_design_row_transform(pixel, pixel_buffer, 
feature_means, pass_stride, features, rank, design_row, transform, 
transform_stride);
-                       weight = nlm_weight(x, y, pixel.x, pixel.y, 
center_buffer + color_passes.y, pixel_buffer + color_passes.y, pass_stride, 
1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
-#  else /* WEIGHTING_NFOR */
-                       filter_get_design_row(pixel, pixel_buffer, 
feature_means, feature_scales, pass_stride, design_row);
-                       weight = nlm_weight(x, y, pixel.x, pixel.y, 
center_buffer + color_passes.y, pixel_buffer + color_passes.y, pass_stride, 
1.0f, kernel_data.integrator.weighting_adjust, 4, rect);
-#  endif
-                       if(weight == 0.0f) continue;
-                       weight /= max(1.0f, variance);
-               }
-#endif
-
-               for(int i = 0; i < solution_size; i++) {
-                       float XtWXinvXt = math_dot(XtWXinv + i*matrix_size, 
design_row, matrix_size);
-                       solution[i] += XtWXinvXt*weight*color;
-               }
-       } END_FOR_PIXEL_WINDOW
+       math_cholesky(XtWX, matrix_size); /* Find L so that L*Lt = Xt*W*X. */
+       math_substitute_forward_vec3(XtWX, matrix_size, solution); /* Solve L*b 
= X^T*y, replacing X^T*y by b. */
+       math_substitute_back_vec3(XtWX, matrix_size, solution); /* Solve L^T*S 
= b, replacing b by S. */
 
        if(kernel_data.integrator.use_gradients) {
                FOR_PIXEL_WINDOW {

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

[Bf-blender-cvs] [584bbd7] soc-2016-cycles_denoising: Cycles: Get rid of one loop over all denoised pixels by using backsubstitution instead of explicit inversion

Reply via email to