Commit: ca6d583008ffb5f0d23ab66a324f5ad5311da951
Author: Lukas Stockner
Date: Fri Jan 13 16:45:13 2017 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rBca6d583008ffb5f0d23ab66a324f5ad5311da951
Cycles: Implement new NLM reconstruction kernels
===================================================================
M intern/cycles/device/device_cpu.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/kernel/filter/filter.h
M intern/cycles/kernel/filter/filter_final_pass_impl.h
M intern/cycles/kernel/filter/filter_nlm_cpu.h
M intern/cycles/kernel/filter/filter_nlm_gpu.h
M intern/cycles/kernel/filter/filter_prefilter.h
M intern/cycles/kernel/kernel_types.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M intern/cycles/kernel/kernels/cuda/kernel.cu
M intern/cycles/util/util_math_matrix.h
===================================================================
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index dfc6995e53..ae7e24f0ef 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -146,6 +146,9 @@ public:
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*,
int, int)> filter_nlm_update_output_kernel;
KernelFunctions<void(*)(float*, float*, int*, int)>
filter_nlm_normalize_kernel;
+ KernelFunctions<void(*)(int, int, float*, float*, int, void*, float*,
float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void(*)(int, int, int, int, int, float*, void*, float*,
float3*, int*, int)> filter_finalize_kernel;
+
#define KERNEL_FUNCTIONS(name) \
KERNEL_NAME_EVAL(cpu, name), \
KERNEL_NAME_EVAL(cpu_sse2, name), \
@@ -170,7 +173,9 @@ public:
filter_nlm_blur_kernel(KERNEL_FUNCTIONS(filter_nlm_blur)),
filter_nlm_calc_weight_kernel(KERNEL_FUNCTIONS(filter_nlm_calc_weight)),
filter_nlm_update_output_kernel(KERNEL_FUNCTIONS(filter_nlm_update_output)),
- filter_nlm_normalize_kernel(KERNEL_FUNCTIONS(filter_nlm_normalize))
+ filter_nlm_normalize_kernel(KERNEL_FUNCTIONS(filter_nlm_normalize)),
+
filter_nlm_construct_gramian_kernel(KERNEL_FUNCTIONS(filter_nlm_construct_gramian)),
+ filter_finalize_kernel(KERNEL_FUNCTIONS(filter_finalize))
{
#ifdef WITH_OSL
kernel_globals.osl = &osl_globals;
@@ -473,29 +478,57 @@ public:
bool use_gradients = kg->__data.integrator.use_gradients;
int hw = kg->__data.integrator.half_window;
- FilterStorage *storage = new
FilterStorage[filter_area.z*filter_area.w];
- float *weight_cache = new float[(2*hw+1)*(2*hw+1)];
+ int storage_num = filter_area.z*filter_area.w;
+ FilterStorage *storage = new FilterStorage[storage_num];
int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
int pass_stride = w*h;
+ float *XtWX = new
float[(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)*storage_num];
+ float3 *XtWY = new float3[(DENOISE_FEATURES+1)*storage_num];
+ memset(XtWX, 0,
sizeof(float)*(DENOISE_FEATURES+1)*(DENOISE_FEATURES+1)*storage_num);
+ memset(XtWY, 0,
sizeof(float3)*(DENOISE_FEATURES+1)*storage_num);
+
for(int y = 0; y < filter_area.w; y++) {
for(int x = 0; x < filter_area.z; x++) {
filter_construct_transform_kernel()(kg, sample,
filter_buffer, x + filter_area.x, y + filter_area.y, storage + y*filter_area.z
+ x, &rect.x);
- filter_reconstruct_kernel()(kg, sample,
filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers,
storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
+ //filter_reconstruct_kernel()(kg, sample,
filter_buffer, x + filter_area.x, y + filter_area.y, offset, stride, buffers,
storage + y*filter_area.z + x, weight_cache, &filter_area.x, &rect.x);
}
}
- if(use_gradients) {
+ {
+ int f = 4;
+ float a = 1.0f;
+ float k_2 = kg->__data.integrator.weighting_adjust;
+ float *weight = filter_buffer + 16*pass_stride;
+ float *variance = filter_buffer + 17*pass_stride;
+ float *difference = new float[pass_stride];
+ float *blurDifference = new float[pass_stride];
+ int local_filter_rect[4] = {filter_area.x-rect.x,
filter_area.y-rect.y, filter_area.z, filter_area.w};
+ for(int i = 0; i < (2*hw+1)*(2*hw+1); i++) {
+ int dy = i / (2*hw+1) - hw;
+ int dx = i % (2*hw+1) - hw;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx, dy,
weight, variance, difference, local_rect, w, 2*pass_stride, a, k_2);
+ filter_nlm_blur_kernel()(difference,
blurDifference, local_rect, w, f);
+ filter_nlm_calc_weight_kernel()(blurDifference,
difference, local_rect, w, f);
+ filter_nlm_blur_kernel()(difference,
blurDifference, local_rect, w, f);
+ filter_nlm_construct_gramian_kernel()(dx, dy,
blurDifference, filter_buffer, 0*pass_stride, storage, XtWX, XtWY, local_rect,
local_filter_rect, w, h, 4);
+ }
+ delete[] difference;
+ delete[] blurDifference;
+ int buffer_params[4] = {offset, stride,
kg->__data.film.pass_stride, kg->__data.film.pass_no_denoising};
for(int y = 0; y < filter_area.w; y++) {
for(int x = 0; x < filter_area.z; x++) {
- filter_divide_combined_kernel()(kg, x +
filter_area.x, y + filter_area.y, sample, buffers, offset, stride);
+ filter_finalize_kernel()(x +
filter_area.x, y + filter_area.y, y*filter_area.z + x, w, h, buffers, storage,
XtWX, XtWY, buffer_params, sample);
}
}
}
delete[] storage;
- delete[] weight_cache;
+ delete[] XtWX;
+ delete[] XtWY;
}
void thread_render(DeviceTask& task)
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 5d347aea22..8ec6ca6b91 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -907,13 +907,12 @@ public:
cuda_push_context();
- CUfunction cuFilterDivideShadow, cuFilterGetFeature,
cuFilterNonLocalMeans, cuFilterCombineHalves;
+ CUfunction cuFilterDivideShadow, cuFilterGetFeature,
cuFilterCombineHalves;
CUfunction cuFilterConstructTransform, cuFilterReconstruct,
cuFilterDivideCombined;
CUdeviceptr d_buffers = cuda_device_ptr(rtile.buffer);
cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow,
cuModule, "kernel_cuda_filter_divide_shadow"));
cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuModule,
"kernel_cuda_filter_get_feature"));
- cuda_assert(cuModuleGetFunction(&cuFilterNonLocalMeans,
cuModule, "kernel_cuda_filter_non_local_means"));
cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves,
cuModule, "kernel_cuda_filter_combine_halves"));
cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform,
cuModule, "kernel_cuda_filter_construct_transform"));
@@ -922,7 +921,6 @@ public:
cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow,
CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature,
CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuFilterNonLocalMeans,
CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves,
CU_FUNC_CACHE_PREFER_L1));
bool l1 = false;
@@ -954,6 +952,7 @@ public:
CUdeviceptr d_denoise_buffers;
int w = align_up(rect.z - rect.x, 4);
+ int h = (rect.w - rect.y);
int frame_stride = w*(rect.w - rect.y);
int pass_stride = frame_stride*rtile.buffers->params.frames;
cuda_assert(cuMemAlloc(&d_denoise_buffers,
22*pass_stride*sizeof(float)));
@@ -1113,7 +1112,6 @@ public:
}
}
}
-#undef CUDA_PTR_ADD
#ifdef WITH_CYCLES_DEBUG_FILTER
#define WRITE_DEBUG(name, pass)
debug_write_pfm(string_printf("debug_%dx%d_cuda_feature%d_%s.pfm",
rtile.x+rtile.buffers->params.overscan, rtile.y+rtile.buffers->params.overscan,
i, name).c_str(), host_denoise_buffer+pass*pass_stride, rtile.w, rtile.h, 1, w)
@@ -1128,9 +1126,10 @@ public:
#endif
/* Use the prefiltered feature to denoise the image. */
+ int storage_num = filter_area.z*filter_area.w;
CUdeviceptr d_storage, d_transforms;
- cuda_assert(cuMemAlloc(&d_storage,
filter_area.z*filter_area.w*sizeof(CUDAFilterStorage)));
- cuda_assert(cuMemAlloc(&d_transforms,
filter_area.z*filter_area.w*sizeof(float)*DENOISE_FEATURES*DENOISE_FEATURES));
+ cuda_assert(cuMemAlloc(&d_storage,
storage_num*sizeof(CUDAFilterStorage)));
+ cuda_assert(cuMemAlloc(&d_transforms,
storage_num*sizeof(float)*DENOISE_FEATURES*DENOISE_FEATURES));
xthreads = (int)sqrt((float)threads_per_block);
ythreads = (int)sqrt((float)threads_per_block);
@@ -1148,33 +1147,85 @@ public:
xthreads, ythreads, 1, /* threads */
0, 0, transform_args, 0));
- void *final_args[] = {&sample,
- &d_denoise_buffers,
- &rtile.offset,
- &rtile.stride,
- &d_transforms,
- &d_storage,
- &d_buffers,
- &filter_area,
- &rect};
- cuda_assert(cuLaunchKernel(cuFilterReconstruct,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, final_args, 0));
- cuda_assert(cuCtxSynchronize());
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight,
cuNLMConstructGramian, cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuModule,
"kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuModule,
"kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuModule,
"kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian,
cuModule, "kernel_cuda_filter_nlm_construct_gramian"));
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuModule,
"kernel_cuda_filter_finalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,
CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,
CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,
CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian,
CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize,
CU_FUNC_CACHE_PREFER_L1));
+
+ xblocks = ((rect.z-rect.x) + xthreads - 1)/xthreads;
+ yblocks = ((rect.w-rect.y) + ythreads - 1)/ythreads;
+
+ int dx, dy;
+ int4 local_rect, local_filter_rect =
make_int4(filter_area.x-rect.x, filter_area.y-rect.y, filter_area.z,
filter_area.w);
+ int f = 4;
+ float a = 1.0f;
+ float k_2 = kernel_globals.integrator.weighting_adjust;
+ int color_pass = 0;
+
+ CUdeviceptr color_buffer = CUDA_PTR_ADD(d_denoise_buffers,
16*pass_stride);
+ CUdeviceptr variance_buffer = CUDA_PTR_ADD(d_denoise_buffers,
17*pass_stride);
+ CUdeviceptr d_difference, d_blurDifference, d_XtWX, d_XtWY;
+ cuda_assert(cuMemAlloc(&d_difference,
pass_stride*sizeof(float)));
+ cuda_assert(cuMemAlloc(&d_blurDifference,
pass_stride*sizeof(float)));
+ cuda_assert(cuMemAlloc(&d_XtWX, storage_num*sizeo
@@ Diff output truncated at 10240 characters. @@
_______________________________________________
Bf-blender-cvs mailing list
Bf-blender-cvs@blender.org
https://lists.blender.org/mailman/listinfo/bf-blender-cvs