Commit: 466bd61857f61b73b8006db8ccc2e78799f4ef30
Author: Lukas Stockner
Date: Fri Jan 13 00:09:09 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB466bd61857f61b73b8006db8ccc2e78799f4ef30
Cycles: Implement new NLM kernels for CUDA
===================================================================
M intern/cycles/device/device_cpu.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/kernel/CMakeLists.txt
M intern/cycles/kernel/filter/filter.h
R098 intern/cycles/kernel/filter/filter_nlm.h intern/cycles/kernel/filter/filter_nlm_cpu.h
A intern/cycles/kernel/filter/filter_nlm_gpu.h
M intern/cycles/kernel/kernels/cuda/kernel.cu
===================================================================
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index a596097f15..dfc6995e53 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -336,7 +336,7 @@ public:
/* Reuse some passes of the filter_buffer for
temporary storage. */
float *sampleV = PASSPTR(0), *sampleVV =
PASSPTR(1), *bufferV = PASSPTR(2), *cleanV = PASSPTR(3);
float *unfiltered = PASSPTR(4), *unfilteredB =
PASSPTR(5);
- float *diffI = PASSPTR(10), *blurDiffI =
PASSPTR(11), *accumI = PASSPTR(12);
+ float *nlm_temp1 = PASSPTR(10), *nlm_temp2 =
PASSPTR(11), *nlm_temp3 = PASSPTR(12);
/* Get the A/B unfiltered passes, the combined
sample variance, the estimated variance of the sample variance and the buffer
variance. */
for(int y = rect.y; y < rect.w; y++) {
@@ -354,14 +354,14 @@ public:
#endif
/* Smooth the (generally pretty noisy) buffer
variance using the spatial information from the sample variance. */
- non_local_means(rect, bufferV, sampleV, cleanV,
sampleVV, diffI, blurDiffI, accumI, 6, 3, 4.0f, 1.0f);
+ non_local_means(rect, bufferV, sampleV, cleanV,
sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 6, 3, 4.0f, 1.0f);
#ifdef WITH_CYCLES_DEBUG_FILTER
WRITE_DEBUG("cleanV", cleanV);
#endif
/* Use the smoothed variance to filter the two
shadow half images using each other for weight calculation. */
- non_local_means(rect, unfiltered, unfilteredB,
sampleV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
- non_local_means(rect, unfilteredB, unfiltered,
bufferV, cleanV, diffI, blurDiffI, accumI, 5, 3, 1.0f, 0.25f);
+ non_local_means(rect, unfiltered, unfilteredB,
sampleV, cleanV, nlm_temp1, nlm_temp2, nlm_temp3, 5, 3, 1.0f, 0.25f);
+ non_local_means(rect, unfilteredB, unfiltered,
bufferV, cleanV, nlm_temp1, nlm_temp2, nlm_temp3, 5, 3, 1.0f, 0.25f);
#ifdef WITH_CYCLES_DEBUG_FILTER
WRITE_DEBUG("filteredA", sampleV);
WRITE_DEBUG("filteredB", bufferV);
@@ -378,8 +378,8 @@ public:
#endif
/* Use the residual variance for a second
filter pass. */
- non_local_means(rect, sampleV, bufferV,
unfiltered , sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
- non_local_means(rect, bufferV, sampleV,
unfilteredB, sampleVV, diffI, blurDiffI, accumI, 4, 2, 1.0f, 0.5f);
+ non_local_means(rect, sampleV, bufferV,
unfiltered , sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 4, 2, 1.0f, 0.5f);
+ non_local_means(rect, bufferV, sampleV,
unfilteredB, sampleVV, nlm_temp1, nlm_temp2, nlm_temp3, 4, 2, 1.0f, 0.5f);
#ifdef WITH_CYCLES_DEBUG_FILTER
WRITE_DEBUG("finalA", unfiltered);
WRITE_DEBUG("finalB", unfiltered + pass_stride);
@@ -403,7 +403,7 @@ public:
{
float *unfiltered = PASSPTR(16);
- float *diffI = PASSPTR(17), *blurDiffI =
PASSPTR(18), *accumI = PASSPTR(19);
+ float *nlm_temp1 = PASSPTR(17), *nlm_temp2 =
PASSPTR(18), *nlm_temp3 = PASSPTR(19);
/* Order in render buffers:
* Normal[X, Y, Z] NormalVar[X, Y, Z]
Albedo[R, G, B] AlbedoVar[R, G, B ] Depth DepthVar
* 0 1 2 3 4 5
6 7 8 9 10 11 12 13
@@ -423,7 +423,7 @@ public:
filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i],
x, y, tile_x, tile_y, offsets, strides, unfiltered, PASSPTR(offset_to[i]+1),
&rect.x);
}
}
- non_local_means(rect, unfiltered,
unfiltered, PASSPTR(offset_to[i]), PASSPTR(offset_to[i]+1), diffI, blurDiffI,
accumI, 2, 2, 1, 0.25f);
+ non_local_means(rect, unfiltered,
unfiltered, PASSPTR(offset_to[i]), PASSPTR(offset_to[i]+1), nlm_temp1,
nlm_temp2, nlm_temp3, 2, 2, 1, 0.25f);
#ifdef WITH_CYCLES_DEBUG_FILTER
#define WRITE_DEBUG(name, var) debug.add_pass(string_printf("f%d_%s", i,
name), var, 1, w);
WRITE_DEBUG("unfiltered", unfiltered);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 27e5b7af5b..5d347aea22 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -830,6 +830,76 @@ public:
}
}
+ void non_local_means(int4 rect, CUdeviceptr image, CUdeviceptr weight,
CUdeviceptr out, CUdeviceptr variance, CUdeviceptr difference, CUdeviceptr
blurDifference, CUdeviceptr weightAccum, int r, int f, float a, float k_2) {
+ int w = align_up(rect.z-rect.x, 4);
+ int h = rect.w-rect.y;
+
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+ cuda_assert(cuMemsetD8(out, 0, sizeof(float)*w*h));
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight,
cuNLMUpdateOutput, cuNLMNormalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuModule,
"kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuModule,
"kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuModule,
"kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuModule,
"kernel_cuda_filter_nlm_update_output"));
+ cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuModule,
"kernel_cuda_filter_nlm_normalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,
CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,
CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,
CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,
CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize,
CU_FUNC_CACHE_PREFER_L1));
+
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(&threads_per_block,
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuNLMCalcDifference));
+
+ int xthreads = (int)sqrt((float)threads_per_block);
+ int ythreads = (int)sqrt((float)threads_per_block);
+ int xblocks = ((rect.z-rect.x) + xthreads - 1)/xthreads;
+ int yblocks = ((rect.w-rect.y) + ythreads - 1)/ythreads;
+
+ int dx, dy;
+ int4 local_rect;
+ void *calc_difference_args[] = {&dx, &dy, &weight, &variance,
&difference, &local_rect, &w, &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &local_rect,
&w, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference,
&local_rect, &w, &f};
+ void *update_output_args[] = {&dx, &dy, &blurDifference,
&image, &out, &weightAccum, &local_rect, &w, &f};
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ dy = i / (2*r+1) - r;
+ dx = i % (2*r+1) - r;
+ local_rect = make_int4(max(0, -dx), max(0, -dy),
rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+
+ cuda_assert(cuLaunchKernel(cuNLMCalcDifference,
+ xblocks , yblocks, 1, /*
blocks */
+ xthreads, ythreads, 1, /*
threads */
+ 0, 0, calc_difference_args,
0));
+ cuda_assert(cuLaunchKernel(cuNLMBlur,
+ xblocks , yblocks, 1, /*
blocks */
+ xthreads, ythreads, 1, /*
threads */
+ 0, 0, blur_args, 0));
+ cuda_assert(cuLaunchKernel(cuNLMCalcWeight,
+ xblocks , yblocks, 1, /*
blocks */
+ xthreads, ythreads, 1, /*
threads */
+ 0, 0, calc_weight_args, 0));
+ cuda_assert(cuLaunchKernel(cuNLMBlur,
+ xblocks , yblocks, 1, /*
blocks */
+ xthreads, ythreads, 1, /*
threads */
+ 0, 0, blur_args, 0));
+ cuda_assert(cuLaunchKernel(cuNLMUpdateOutput,
+ xblocks , yblocks, 1, /*
blocks */
+ xthreads, ythreads, 1, /*
threads */
+ 0, 0, update_output_args,
0));
+ }
+
+ local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+ void *normalize_args[] = {&out, &weightAccum, &local_rect, &w};
+ cuda_assert(cuLaunchKernel(cuNLMNormalize,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, normalize_args, 0));
+ }
+
void denoise(RenderTile &rtile, int sample)
{
if(have_error())
@@ -892,62 +962,38 @@ public:
for(int frame = 0; frame < rtile.buffers->params.frames;
frame++) {
CUdeviceptr d_denoise_buffer =
CUDA_PTR_ADD(d_denoise_buffers, frame_stride*frame);
CUdeviceptr d_buffer = CUDA_PTR_ADD(d_buffers,
frame*rtile.buffers->params.width*rtile.buffers->params.height*rtile.buffers->params.get_passes_size());
- /* ==== Step 1: Prefilter general features. ==== */
- {
- int mean_from[] = { 0, 1, 2, 6, 7, 8,
12 };
- int variance_from[] = { 3, 4, 5, 9, 10, 11,
13 };
- int offset_to[] = { 0, 2, 4, 10, 12, 14,
6 };
- for(int i = 0; i < 7; i++) {
- CUdeviceptr d_mean =
CUDA_PTR_ADD(d_denoise_buffer, offset_to[i]*pass_stride);
- CUdeviceptr d_variance =
CUDA_PTR_ADD(d_denoise_buffer, (offset_to[i]+1)*pass_stride);
- CUdeviceptr d_unfiltered =
CUDA_PTR_ADD(d_denoise_buffer, 16*pass_stride);
-
- void *get_feature_args[] = {&sample,
&d_buffer, &mean_from[i], &variance_from[i],
-
&buffer_area,
-
&rtile.offset, &rtile.stride,
-
&d_unfiltered, &d_variance,
- &rect};
-
cuda_assert(cuLaunchKernel(cuFilterGetFeature,
- xblocks ,
yblocks, 1, /* blocks */
- xthreads,
ythreads, 1, /* threads */
- 0, 0,
get_feature_args, 0));
-
- /* Smooth the (generally pretty noisy)
buffer variance using the spatial information from the sample variance. */
- float a = 1.0f, k_2 = 0.25f;
- int r = 4, f = 2;
- void *filter_feature_args[] =
{&d_unfiltered, &d_unfiltered, &d_variance, &d_mean,
- &rect,
- &r, &f,
&a, &k_2};
-
cuda_assert(cuLaunchKernel(cuFilterNonLocalMeans,
- xblocks ,
yblocks, 1, /* blocks */
- xthreads,
ythreads, 1, /* threads */
- 0, 0,
filter_feature_args, 0));
- }
- }
- /*
@@ Diff output truncated at 10240 characters. @@
_______________________________________________
Bf-blender-cvs mailing list
Bf-blender-cvs@blender.org
https://lists.blender.org/mailman/listinfo/bf-blender-cvs