[Bf-blender-cvs] [466bd61857] soc-2016-cycles_denoising: Cycles: Implement new NLM kernels for CUDA

Lukas Stockner Thu, 19 Jan 2017 20:47:10 -0800

Commit: 466bd61857f61b73b8006db8ccc2e78799f4ef30
Author: Lukas Stockner
Date:   Fri Jan 13 00:09:09 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB466bd61857f61b73b8006db8ccc2e78799f4ef30


Cycles: Implement new NLM kernels for CUDA

===================================================================

M       intern/cycles/device/device_cpu.cpp
M       intern/cycles/device/device_cuda.cpp
M       intern/cycles/kernel/CMakeLists.txt
M       intern/cycles/kernel/filter/filter.h
R098    intern/cycles/kernel/filter/filter_nlm.h        intern/cycles/kernel/filter/filter_nlm_cpu.h
A       intern/cycles/kernel/filter/filter_nlm_gpu.h
M       intern/cycles/kernel/kernels/cuda/kernel.cu

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index a596097f15..dfc6995e53 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -336,7 +336,7 @@ public:
                                /* Reuse some passes of the filter_buffer for 
temporary storage. */
                                float *sampleV = PASSPTR(0), *sampleVV = 
PASSPTR(1), *bufferV = PASSPTR(2), *cleanV = PASSPTR(3);
                                float *unfiltered = PASSPTR(4), *unfilteredB = 
PASSPTR(5);
-                               float *diffI = PASSPTR(10), *blurDiffI = 
PASSPTR(11), *accumI = PASSPTR(12);
+                               float *nlm_temp1 = PASSPTR(10), *nlm_temp2 = 
PASSPTR(11), *nlm_temp3 = PASSPTR(12);
 
                                /* Get the A/B unfiltered passes, the combined 
sample variance, the estimated variance of the sample variance and the buffer 
variance. */
                                for(int y = rect.y; y < rect.w; y++) {
@@ -354,14 +354,14 @@ public:
 #endif
 
                                /* Smooth the (generally pretty noisy) buffer 
variance using the spatial information from the sample variance. */
-                               non_local_means(rect, bufferV, sampleV, cleanV, 
sampleVV, diffI, blurDiffI, accumI, 6, 3, 4.0f, 1.0f);
+                               non_local_means(rect, bufferV, sampleV, cleanV, 
sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 6, 3, 4.0f, 1.0f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
                                WRITE_DEBUG("cleanV", cleanV);
 #endif
 
                                /* Use the smoothed variance to filter the two 
shadow half images using each other for weight calculation. */
-                               non_local_means(rect, unfiltered, unfilteredB, 
sampleV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
-                               non_local_means(rect, unfilteredB, unfiltered, 
bufferV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
+                               non_local_means(rect, unfiltered, unfilteredB, 
sampleV, cleanV, nlm_temp1, nlm_temp2, nlm_temp3, 5, 3, 1.0f, 0.25f);
+                               non_local_means(rect, unfilteredB, unfiltered, 
bufferV, cleanV, nlm_temp1, nlm_temp2, nlm_temp3, 5, 3, 1.0f, 0.25f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
                                WRITE_DEBUG("filteredA", sampleV);
                                WRITE_DEBUG("filteredB", bufferV);
@@ -378,8 +378,8 @@ public:
 #endif
 
                                /* Use the residual variance for a second 
filter pass. */
-                               non_local_means(rect, sampleV, bufferV, 
unfiltered , sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
-                               non_local_means(rect, bufferV, sampleV, 
unfilteredB, sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
+                               non_local_means(rect, sampleV, bufferV, 
unfiltered , sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 4, 2, 1.0f, 0.5f);
+                               non_local_means(rect, bufferV, sampleV, 
unfilteredB, sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 4, 2, 1.0f, 0.5f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
                                WRITE_DEBUG("finalA", unfiltered);
                                WRITE_DEBUG("finalB", unfiltered + pass_stride);
@@ -403,7 +403,7 @@ public:
                        {
 
                                float *unfiltered = PASSPTR(16);
-                               float *diffI = PASSPTR(17), *blurDiffI = 
PASSPTR(18), *accumI = PASSPTR(19);
+                               float *nlm_temp1 = PASSPTR(17), *nlm_temp2 = 
PASSPTR(18), *nlm_temp3 = PASSPTR(19);
                                /* Order in render buffers:
                                 *   Normal[X, Y, Z] NormalVar[X, Y, Z] 
Albedo[R, G, B] AlbedoVar[R, G, B ] Depth DepthVar
                                 *          0  1  2            3  4  5         
6  7  8            9  10 11  12    13
@@ -423,7 +423,7 @@ public:
                                                        
filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], 
x, y, tile_x, tile_y, offsets, strides, unfiltered, PASSPTR(offset_to[i]+1), 
&rect.x);
                                                }
                                        }
-                                       non_local_means(rect, unfiltered, 
unfiltered, PASSPTR(offset_to[i]), PASSPTR(offset_to[i]+1), diffI, blurDiffI, 
accumI, 2, 2, 1, 0.25f);
+                                       non_local_means(rect, unfiltered, 
unfiltered, PASSPTR(offset_to[i]), PASSPTR(offset_to[i]+1), nlm_temp1, 
nlm_temp2, nlm_temp3, 2, 2, 1, 0.25f);
 #ifdef WITH_CYCLES_DEBUG_FILTER
 #define WRITE_DEBUG(name, var) debug.add_pass(string_printf("f%d_%s", i, 
name), var, 1, w);
                                        WRITE_DEBUG("unfiltered", unfiltered);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 27e5b7af5b..5d347aea22 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -830,6 +830,76 @@ public:
                }
        }
 
+       void non_local_means(int4 rect, CUdeviceptr image, CUdeviceptr weight, 
CUdeviceptr out, CUdeviceptr variance, CUdeviceptr difference, CUdeviceptr 
blurDifference, CUdeviceptr weightAccum, int r, int f, float a, float k_2) {
+               int w = align_up(rect.z-rect.x, 4);
+               int h = rect.w-rect.y;
+
+               cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+               cuda_assert(cuMemsetD8(out, 0, sizeof(float)*w*h));
+
+               CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, 
cuNLMUpdateOutput, cuNLMNormalize;
+               cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuModule, 
"kernel_cuda_filter_nlm_calc_difference"));
+               cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuModule, 
"kernel_cuda_filter_nlm_blur"));
+               cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuModule, 
"kernel_cuda_filter_nlm_calc_weight"));
+               cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuModule, 
"kernel_cuda_filter_nlm_update_output"));
+               cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuModule, 
"kernel_cuda_filter_nlm_normalize"));
+
+               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, 
CU_FUNC_CACHE_PREFER_L1));
+               cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, 
CU_FUNC_CACHE_PREFER_L1));
+               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, 
CU_FUNC_CACHE_PREFER_L1));
+               cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, 
CU_FUNC_CACHE_PREFER_L1));
+               cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, 
CU_FUNC_CACHE_PREFER_L1));
+
+               int threads_per_block;
+               cuda_assert(cuFuncGetAttribute(&threads_per_block, 
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuNLMCalcDifference));
+
+               int xthreads = (int)sqrt((float)threads_per_block);
+               int ythreads = (int)sqrt((float)threads_per_block);
+               int xblocks = ((rect.z-rect.x) + xthreads - 1)/xthreads;
+               int yblocks = ((rect.w-rect.y) + ythreads - 1)/ythreads;
+
+               int dx, dy;
+               int4 local_rect;
+               void *calc_difference_args[] = {&dx, &dy, &weight, &variance, 
&difference, &local_rect, &w, &a, &k_2};
+               void *blur_args[] = {&difference, &blurDifference, &local_rect, 
&w, &f};
+               void *calc_weight_args[] = {&blurDifference, &difference, 
&local_rect, &w, &f};
+               void *update_output_args[] = {&dx, &dy, &blurDifference, 
&image, &out, &weightAccum, &local_rect, &w, &f};
+
+               for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+                       dy = i / (2*r+1) - r;
+                       dx = i % (2*r+1) - r;
+                       local_rect = make_int4(max(0, -dx), max(0, -dy), 
rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+
+                       cuda_assert(cuLaunchKernel(cuNLMCalcDifference,
+                                                  xblocks , yblocks, 1, /* 
blocks */
+                                                  xthreads, ythreads, 1, /* 
threads */
+                                                  0, 0, calc_difference_args, 
0));
+                       cuda_assert(cuLaunchKernel(cuNLMBlur,
+                                                  xblocks , yblocks, 1, /* 
blocks */
+                                                  xthreads, ythreads, 1, /* 
threads */
+                                                  0, 0, blur_args, 0));
+                       cuda_assert(cuLaunchKernel(cuNLMCalcWeight,
+                                                  xblocks , yblocks, 1, /* 
blocks */
+                                                  xthreads, ythreads, 1, /* 
threads */
+                                                  0, 0, calc_weight_args, 0));
+                       cuda_assert(cuLaunchKernel(cuNLMBlur,
+                                                  xblocks , yblocks, 1, /* 
blocks */
+                                                  xthreads, ythreads, 1, /* 
threads */
+                                                  0, 0, blur_args, 0));
+                       cuda_assert(cuLaunchKernel(cuNLMUpdateOutput,
+                                                  xblocks , yblocks, 1, /* 
blocks */
+                                                  xthreads, ythreads, 1, /* 
threads */
+                                                  0, 0, update_output_args, 
0));
+               }
+
+               local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+               void *normalize_args[] = {&out, &weightAccum, &local_rect, &w};
+               cuda_assert(cuLaunchKernel(cuNLMNormalize,
+                                          xblocks , yblocks, 1, /* blocks */
+                                          xthreads, ythreads, 1, /* threads */
+                                          0, 0, normalize_args, 0));
+       }
+
        void denoise(RenderTile &rtile, int sample)
        {
                if(have_error())
@@ -892,62 +962,38 @@ public:
                for(int frame = 0; frame < rtile.buffers->params.frames; 
frame++) {
                        CUdeviceptr d_denoise_buffer = 
CUDA_PTR_ADD(d_denoise_buffers, frame_stride*frame);
                        CUdeviceptr d_buffer = CUDA_PTR_ADD(d_buffers, 
frame*rtile.buffers->params.width*rtile.buffers->params.height*rtile.buffers->params.get_passes_size());
-                       /* ==== Step 1: Prefilter general features. ==== */
-                       {
-                               int mean_from[]      = { 0, 1, 2,  6,  7,  8, 
12 };
-                               int variance_from[]  = { 3, 4, 5,  9, 10, 11, 
13 };
-                               int offset_to[]      = { 0, 2, 4, 10, 12, 14,  
6 };
-                               for(int i = 0; i < 7; i++) {
-                                       CUdeviceptr d_mean = 
CUDA_PTR_ADD(d_denoise_buffer, offset_to[i]*pass_stride);
-                                       CUdeviceptr d_variance = 
CUDA_PTR_ADD(d_denoise_buffer, (offset_to[i]+1)*pass_stride);
-                                       CUdeviceptr d_unfiltered = 
CUDA_PTR_ADD(d_denoise_buffer, 16*pass_stride);
-
-                                       void *get_feature_args[] = {&sample, 
&d_buffer, &mean_from[i], &variance_from[i],
-                                                                   
&buffer_area,
-                                                                   
&rtile.offset, &rtile.stride,
-                                                                   
&d_unfiltered, &d_variance,
-                                                                   &rect};
-                                       
cuda_assert(cuLaunchKernel(cuFilterGetFeature,
-                                                                  xblocks , 
yblocks, 1, /* blocks */
-                                                                  xthreads, 
ythreads, 1, /* threads */
-                                                                  0, 0, 
get_feature_args, 0));
-
-                                       /* Smooth the (generally pretty noisy) 
buffer variance using the spatial information from the sample variance. */
-                                       float a = 1.0f, k_2 = 0.25f;
-                                       int r = 4, f = 2;
-                                       void *filter_feature_args[] = 
{&d_unfiltered, &d_unfiltered, &d_variance, &d_mean,
-                                                                      &rect,
-                                                                      &r, &f, 
&a, &k_2};
-                                       
cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
-                                                                  xblocks , 
yblocks, 1, /* blocks */
-                                                                  xthreads, 
ythreads, 1, /* threads */
-                                                                  0, 0, 
filter_feature_args, 0));
-                               }
-                       }
 
-                       /* 

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

[Bf-blender-cvs] [466bd61857] soc-2016-cycles_denoising: Cycles: Implement new NLM kernels for CUDA

Reply via email to