[Bf-blender-cvs] [b4e7747] soc-2016-cycles_denoising: Cycles: Move the final_pass functions to a unified implementation and generally shuffle a lot of code around

Lukas Stockner Mon, 21 Nov 2016 19:26:28 -0800

Commit: b4e774722fc4f2e064e41117bd17132d98f1a61e
Author: Lukas Stockner
Date:   Tue Nov 22 04:11:16 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rBb4e774722fc4f2e064e41117bd17132d98f1a61e


Cycles: Move the final_pass functions to a unified implementation and generally 
shuffle a lot of code around

===================================================================

M       intern/cycles/device/device_cpu.cpp
M       intern/cycles/kernel/CMakeLists.txt
A       intern/cycles/kernel/filter/filter_final_pass_impl.h
M       intern/cycles/kernel/kernel_filter.h
M       intern/cycles/kernel/kernel_filter_util.h
M       intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M       intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M       intern/cycles/kernel/kernels/cuda/kernel.cu
M       intern/cycles/util/util_atomic.h
M       intern/cycles/util/util_math.h
M       intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp 
b/intern/cycles/device/device_cpu.cpp
index fcce0ff..61c1e75 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -139,7 +139,7 @@ public:
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, 
int)>                                                  
filter_combine_halves_kernel;
        KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, 
int*)>                                      filter_construct_transform_kernel;
        KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, 
int*)>                                      filter_estimate_wlr_params_kernel;
-       KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, 
int, float*, void*, int*, int*)>              filter_final_pass_wlr_kernel;
+       KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, 
int, float*, void*, float*, int*, int*)>      filter_final_pass_wlr_kernel;
        KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, 
int, float*, void*, float*, int*, int*)>      filter_final_pass_nlm_kernel;
        KernelFunctions<void(*)(int, int, float**, float**, float**, float**, 
int*, int, int, float, float)>              filter_non_local_means_3_kernel;
        KernelFunctions<void(*)(KernelGlobals*, float*, int, int, int, int, 
float, float*, int*)>                         filter_old_1_kernel;
@@ -453,8 +453,9 @@ public:
                bool use_gradients = kg->__data.integrator.use_gradients;
                bool nlm_weights = kg->__data.integrator.use_nlm_weights;
 
-               FilterStorage *storage = new 
FilterStorage[filter_area.z*filter_area.w];
                int hw = kg->__data.integrator.half_window;
+               FilterStorage *storage = new 
FilterStorage[filter_area.z*filter_area.w];
+               float *weight_cache = new float[(2*hw+1)*(2*hw+1)];
 
                int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
                int pass_stride = w*h;
@@ -511,14 +512,12 @@ public:
                        }
                }
                else if(nlm_weights) {
-                       float *weight_cache = new float[(2*hw+1)*(2*hw+1)];
                        for(int y = 0; y < filter_area.w; y++) {
                                for(int x = 0; x < filter_area.z; x++) {
                                        filter_construct_transform_kernel()(kg, 
sample, filter_buffer, x + filter_area.x, y + filter_area.y, storage + 
y*filter_area.z + x, &rect.x);
                                        filter_final_pass_nlm_kernel()(kg, 
sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, 
buffers, storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
                                }
                        }
-                       delete[] weight_cache;
                }
                else {
                        for(int y = 0; y < filter_area.w; y++) {
@@ -541,7 +540,7 @@ public:
 #endif
                        for(int y = 0; y < filter_area.w; y++) {
                                for(int x = 0; x < filter_area.z; x++) {
-                                       filter_final_pass_wlr_kernel()(kg, 
sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, 
buffers, storage + y*filter_area.z + x, &filter_area.x, &rect.x);
+                                       filter_final_pass_wlr_kernel()(kg, 
sample, filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, 
buffers, storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
                                }
                        }
 #ifdef WITH_CYCLES_DEBUG_FILTER
@@ -561,6 +560,7 @@ public:
                }
 
                delete[] storage;
+               delete[] weight_cache;
        }
 
        void thread_render(DeviceTask& task)
diff --git a/intern/cycles/kernel/CMakeLists.txt 
b/intern/cycles/kernel/CMakeLists.txt
index 9d9764b..84a0636 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -209,6 +209,10 @@ set(SRC_SPLIT_HEADERS
        split/kernel_sum_all_radiance.h
 )
 
+set(SRC_FILTER_HEADERS
+       filter/filter_final_pass_impl.h
+)
+
 # CUDA module
 
 if(WITH_CYCLES_CUDA_BINARIES)
@@ -346,6 +350,7 @@ add_library(cycles_kernel
        ${SRC_SVM_HEADERS}
        ${SRC_GEOM_HEADERS}
        ${SRC_SPLIT_HEADERS}
+       ${SRC_FILTER_HEADERS}
 )
 
 if(WITH_CYCLES_CUDA)
@@ -382,4 +387,5 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} 
"${SRC_SVM_HEADERS}" ${CYCLES_INSTAL
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" 
${CYCLES_INSTALL_PATH}/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" 
${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" 
${CYCLES_INSTALL_PATH}/kernel/split)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" 
${CYCLES_INSTALL_PATH}/kernel/filter)
 
diff --git a/intern/cycles/kernel/filter/filter_final_pass_impl.h 
b/intern/cycles/kernel/filter/filter_final_pass_impl.h
new file mode 100644
index 0000000..9e2b9b1
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_final_pass_impl.h
@@ -0,0 +1,276 @@
+#ifdef __KERNEL_CUDA__
+#define STORAGE_TYPE CUDAFilterStorage
+#else
+#define STORAGE_TYPE FilterStorage
+#endif
+
+ccl_device void FUNCTION_NAME(KernelGlobals *kg, int sample, float 
ccl_readonly_ptr buffer, int x, int y, int offset, int stride, float *buffers, 
int filtered_passes, int2 color_passes, STORAGE_TYPE *storage, float 
*weight_cache, float ccl_readonly_ptr transform, int transform_stride, int4 
filter_area, int4 rect)
+{
+       int buffer_w = align_up(rect.z - rect.x, 4);
+       int buffer_h = (rect.w - rect.y);
+       int pass_stride = buffer_h * buffer_w * kernel_data.film.num_frames;
+       color_passes *= pass_stride;
+       int num_frames = kernel_data.film.num_frames;
+       int prev_frames = kernel_data.film.prev_frames;
+
+       int2 low  = make_int2(max(rect.x, x - 
kernel_data.integrator.half_window),
+                             max(rect.y, y - 
kernel_data.integrator.half_window));
+       int2 high = make_int2(min(rect.z, x + 
kernel_data.integrator.half_window + 1),
+                             min(rect.w, y + 
kernel_data.integrator.half_window + 1));
+
+       float ccl_readonly_ptr pixel_buffer;
+       float ccl_readonly_ptr center_buffer = buffer + (y - rect.y) * buffer_w 
+ (x - rect.x);
+       int3 pixel;
+
+       float3 center_color  = filter_get_pixel_color(center_buffer + 
color_passes.x, pass_stride);
+       float sqrt_center_variance = 
sqrtf(filter_get_pixel_variance(center_buffer + color_passes.x, pass_stride));
+
+       /* NFOR weighting directly writes to the design row, so it doesn't need 
the feature vector and always uses full rank. */
+#ifndef WEIGHTING_NFOR
+#  ifdef __KERNEL_CUDA__
+       /* On GPUs, store the feature vector in shared memory for faster 
access. */
+       __shared__ float 
shared_features[DENOISE_FEATURES*CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH];
+       float *features = shared_features + 
DENOISE_FEATURES*(threadIdx.y*blockDim.x + threadIdx.x);
+#  else
+       float features[DENOISE_FEATURES];
+#  endif
+       const int rank = storage->rank;
+       const int matrix_size = rank+1;
+#else
+       const int matrix_size = DENOISE_FEATURES;
+       float *feature_scales = transform;
+#endif
+
+       float feature_means[DENOISE_FEATURES];
+       filter_get_features(make_int3(x, y, 0), center_buffer, feature_means, 
NULL, pass_stride);
+
+#ifdef WEIGHTING_WLR
+       /* Apply a median filter to the 3x3 window aroung the current pixel. */
+       int sort_idx = 0;
+       float global_bandwidths[9];
+       for(int dy = max(-1, filter_area.y - y); dy < min(2, 
filter_area.y+filter_area.w - y); dy++) {
+               for(int dx = max(-1, filter_area.x - x); dx < min(2, 
filter_area.x+filter_area.z - x); dx++) {
+                       int ofs = dy*filter_area.z + dx;
+                       if(storage[ofs].rank != rank) continue;
+                       global_bandwidths[sort_idx++] = 
storage[ofs].global_bandwidth;
+               }
+       }
+       /* Insertion-sort the global bandwidths (fast enough for 9 elements). */
+       for(int i = 1; i < sort_idx; i++) {
+               float v = global_bandwidths[i];
+               int j;
+               for(j = i-1; j >= 0 && global_bandwidths[j] > v; j--)
+                       global_bandwidths[j+1] = global_bandwidths[j];
+               global_bandwidths[j+1] = v;
+       }
+       float inv_global_bandwidth = 1.0f / (global_bandwidths[sort_idx/2] * 
kernel_data.integrator.weighting_adjust);
+
+       float bandwidth_factor[DENOISE_FEATURES];
+       for(int i = 0; i < rank; i++) {
+               /* Same as above, divide by the bandwidth since the 
bandwidth_factor actually is the inverse of the bandwidth. */
+               bandwidth_factor[i] = storage->bandwidth[i] * 
inv_global_bandwidth;
+       }
+#endif
+
+       /* Essentially, this function is just a first-order regression solver.
+        * We model the pixel color as a linear function of the feature vectors.
+        * So, we search the parameters S that minimize W*(X*S - y), where:
+        * - X is the design matrix containing all the feature vectors
+        * - y is the vector containing all the pixel colors
+        * - W is the diagonal matrix containing all pixel weights
+        * Since this is just regular least-squares, the solution is given by:
+        * S = inv(Xt*W*X)*Xt*W*y */
+
+       float XtWX[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)], 
design_row[DENOISE_FEATURES+1];
+
+       math_matrix_zero(XtWX, matrix_size);
+       /* Construct Xt*W*X matrix (and fill weight cache, if used). */
+       FOR_PIXEL_WINDOW {
+               float3 color = filter_get_pixel_color(pixel_buffer + 
color_passes.x, pass_stride);
+               float variance = filter_get_pixel_variance(pixel_buffer + 
color_passes.x, pass_stride);
+               if(filter_firefly_rejection(color, variance, center_color, 
sqrt_center_variance)) {
+#ifdef WEIGHT_CACHING_CUDA
+                       if(cache_idx < CUDA_WEIGHT_CACHE_SIZE) 
weight_cache[cache_idx] = 0.0f;
+#elif defined(WEIGHT_CACHING_CPU)
+                       weight_cache[cache_idx] = 0.0f;
+#endif
+                       continue;
+               }
+
+#ifdef WEIGHTING_WLR
+               float weight = filter_get_design_row_transform_weight(pixel, 
pixel_buffer, feature_means, pass_stride, features, rank, design_row, 
transform, transform_stride, bandwidth_factor);
+#elif defined(WEIGHTING_NLM)
+               filter_get_design_row_transform(pixel, pixel_buffer, 
feature_means, pass_stride, features, rank, design_row, transform, 
transform_stride);
+               float weight = nlm_weight(x, y, pixel.x, pixel.y, center_buffer 
+ color_passes.y, pixel_buffer + color_passes.y, pass_stride, 1.0f, 
kernel_data.integrator.weighting_adjust, 4, rect);
+#else /* WEIGHTING_NFOR */
+               filter_get_design_row(pixel, pixel_buffer, feature_means, 
feature_scales, pass_stride, design_row);
+               float weight = nlm_weight(x, y, pixel.x, pixel.y, center_buffer 
+ color_passes.y, pixel_buffer + color_passes.y, pass_stride, 1.0f, 
kernel_data.integrator.weighting_adjust, 4, rect);
+#endif
+               if(weight < 1e-5f) {
+#ifdef WEIGHT_CACHING_CUDA
+                       if(cache_idx < CUDA_WEIGHT_CACHE_SIZE) 
weight_cache[cache_idx] = 0.0f;
+#elif defined(WEIGHT_CACHING_CPU)
+                       weight_cache[cache_idx]

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

[Bf-blender-cvs] [b4e7747] soc-2016-cycles_denoising: Cycles: Move the final_pass functions to a unified implementation and generally shuffle a lot of code around

Reply via email to