Commit: 057821cd5708793450c73b4dd6591f8970dbc41f
Author: Lukas Stockner
Date:   Sat Sep 3 17:09:00 2016 +0200
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB057821cd5708793450c73b4dd6591f8970dbc41f

Cycles: Tweak the reduced feature space calculation

===================================================================

M       intern/cycles/kernel/kernel_filter.h
M       intern/cycles/kernel/kernel_filter_util.h

===================================================================

diff --git a/intern/cycles/kernel/kernel_filter.h 
b/intern/cycles/kernel/kernel_filter.h
index e842ab2..c32541a 100644
--- a/intern/cycles/kernel/kernel_filter.h
+++ b/intern/cycles/kernel/kernel_filter.h
@@ -61,13 +61,12 @@ ccl_device void 
kernel_filter_construct_transform(KernelGlobals *kg, int sample,
        math_vector_zero(feature_scale, DENOISE_FEATURES);
 
        FOR_PIXEL_WINDOW {
-               filter_get_features(px, py, pt, pixel_buffer, features, 
feature_means, pass_stride);
+               filter_get_feature_scales(px, py, pt, pixel_buffer, features, 
feature_means, pass_stride);
                for(int i = 0; i < DENOISE_FEATURES; i++)
-                       feature_scale[i] = max(feature_scale[i], 
fabsf(features[i]));
+                       feature_scale[i] = max(feature_scale[i], features[i]);
        } END_FOR_PIXEL_WINDOW
 
-       for(int i = 0; i < DENOISE_FEATURES; i++)
-               feature_scale[i] = 1.0f / max(feature_scale[i], 0.01f);
+       filter_calculate_scale(feature_scale);
 
 
 
@@ -92,6 +91,7 @@ ccl_device void 
kernel_filter_construct_transform(KernelGlobals *kg, int sample,
        int rank = svd_cuda(feature_matrix, transform, transform_stride, 
singular, DENOISE_FEATURES);
 
        float singular_threshold = 0.01f + 2.0f * (sqrtf(feature_matrix_norm) / 
(sqrtf(rank) * 0.5f));
+       singular_threshold *= singular_threshold;
 
        rank = 0;
        for(int i = 0; i < DENOISE_FEATURES; i++, rank++) {
@@ -462,13 +462,12 @@ ccl_device void 
kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
 
        __m128 feature_scale[DENOISE_FEATURES] = {_mm_setzero_ps()};
        FOR_PIXEL_WINDOW_SSE {
-               filter_get_features_sse(x4, y4, t4, active_pixels, 
pixel_buffer, features, feature_means, pass_stride);
+               filter_get_feature_scales_sse(x4, y4, t4, active_pixels, 
pixel_buffer, features, feature_means, pass_stride);
                for(int i = 0; i < DENOISE_FEATURES; i++)
-                       feature_scale[i] = _mm_max_ps(feature_scale[i], 
_mm_fabs_ps(features[i]));
+                       feature_scale[i] = _mm_max_ps(feature_scale[i], 
features[i]);
        } END_FOR_PIXEL_WINDOW_SSE
 
-       for(int i = 0; i < DENOISE_FEATURES; i++)
-               feature_scale[i] = 
_mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(feature_scale[i]), _mm_set1_ps(0.01f)));
+       filter_calculate_scale_sse(feature_scale);
 
        __m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
        __m128 feature_matrix_norm = _mm_setzero_ps();
@@ -493,6 +492,7 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals 
*kg, int sample, flo
        __m128 feature_transform_sse[DENOISE_FEATURES*DENOISE_FEATURES];
        int rank = svd(feature_matrix, feature_transform, singular, 
DENOISE_FEATURES);
        float singular_threshold = 0.01f + 2.0f * 
(sqrtf(_mm_hsum_ss(feature_matrix_norm)) / (sqrtf(rank) * 0.5f));
+       singular_threshold *= singular_threshold;
 
        rank = 0;
        for(int i = 0; i < DENOISE_FEATURES; i++, rank++) {
@@ -740,13 +740,12 @@ ccl_device void 
kernel_filter_estimate_params(KernelGlobals *kg, int sample, flo
        math_vector_zero(feature_scale, DENOISE_FEATURES);
 
        FOR_PIXEL_WINDOW {
-               filter_get_features(px, py, pt, pixel_buffer, features, 
feature_means, pass_stride);
+               filter_get_feature_scales(px, py, pt, pixel_buffer, features, 
feature_means, pass_stride);
                for(int i = 0; i < DENOISE_FEATURES; i++)
-                       feature_scale[i] = max(feature_scale[i], 
fabsf(features[i]));
+                       feature_scale[i] = max(feature_scale[i], features[i]);
        } END_FOR_PIXEL_WINDOW
 
-       for(int i = 0; i < DENOISE_FEATURES; i++)
-               feature_scale[i] = 1.0f / max(feature_scale[i], 0.01f);
+       filter_calculate_scale(feature_scale);
 
 
 
@@ -786,6 +785,7 @@ ccl_device void kernel_filter_estimate_params(KernelGlobals 
*kg, int sample, flo
        float singular_threshold = 0.01f + 2.0f * 
sqrtf(math_largest_eigenvalue(perturbation_matrix, NORM_FEATURE_NUM, 
tempvector_2, tempvector_2 + DENOISE_FEATURES));
 #else
        float singular_threshold = 0.01f + 2.0f * (sqrtf(feature_matrix_norm) / 
(sqrtf(rank) * 0.5f));
+       singular_threshold *= singular_threshold;
 #endif
 
        rank = 0;
diff --git a/intern/cycles/kernel/kernel_filter_util.h 
b/intern/cycles/kernel/kernel_filter_util.h
index e1d3afb..5981358 100644
--- a/intern/cycles/kernel/kernel_filter_util.h
+++ b/intern/cycles/kernel/kernel_filter_util.h
@@ -52,10 +52,10 @@ ccl_device_inline void filter_get_features(int x, int y, 
int t, float ccl_readon
 #ifdef DENOISE_TEMPORAL
        *(feature++) = t;
 #endif
+       *(feature++) = ccl_get_feature(6);
        *(feature++) = ccl_get_feature(0);
        *(feature++) = ccl_get_feature(2);
        *(feature++) = ccl_get_feature(4);
-       *(feature++) = ccl_get_feature(6);
        *(feature++) = ccl_get_feature(8);
        *(feature++) = ccl_get_feature(10);
        *(feature++) = ccl_get_feature(12);
@@ -79,11 +79,11 @@ ccl_device_inline void filter_get_feature_variance(int x, 
int y, float ccl_reado
 #ifdef DENOISE_TEMPORAL
        *(feature++) = 0.0f;
 #endif
+       *(feature++) = ccl_get_feature(7);
        *(feature++) = ccl_get_feature(1);
        *(feature++) = ccl_get_feature(3);
        *(feature++) = ccl_get_feature(5);
-       *(feature++) = ccl_get_feature(7);
-       *(feature++) = ccl_get_feature(9);
+       *(feature++) = 0.0f;//ccl_get_feature(9);
        *(feature++) = ccl_get_feature(11);
        *(feature++) = ccl_get_feature(13);
        *(feature++) = ccl_get_feature(15);
@@ -96,6 +96,54 @@ ccl_device_inline void filter_get_feature_variance(int x, 
int y, float ccl_reado
                features[i] *= scale[i]*scale[i];
 }
 
+ccl_device_inline void filter_get_feature_scales(int x, int y, int t, float 
ccl_readonly_ptr buffer, float *scales, float *mean, int pass_stride)
+{
+       *(scales++) = fabsf(x - *(mean++)); //X
+       *(scales++) = fabsf(y - *(mean++)); //Y
+#ifdef DENOISE_TEMPORAL
+       *(scales++) = fabsf(t - *(mean++)); //T
+#endif
+
+       *(scales++) = fabsf(ccl_get_feature(6) - *(mean++)); //Depth
+
+       float normalS = len_squared(make_float3(ccl_get_feature(0) - mean[0], 
ccl_get_feature(2) - mean[1], ccl_get_feature(4) - mean[2]));
+       mean += 3;
+       *(scales++) = normalS; //NormalX
+       *(scales++) = normalS; //NormalY
+       *(scales++) = normalS; //NormalZ
+
+       *(scales++) = fabsf(ccl_get_feature(8) - *(mean++)); //Shadow
+
+       float normalT = len_squared(make_float3(ccl_get_feature(10) - mean[0], 
ccl_get_feature(12) - mean[1], ccl_get_feature(14) - mean[2]));
+       mean += 3;
+       *(scales++) = normalT; //AlbedoR
+       *(scales++) = normalT; //AlbedoG
+       *(scales++) = normalT; //AlbedoB
+}
+
+ccl_device_inline void filter_calculate_scale(float *scale)
+{
+       scale[0] = 1.0f/max(scale[0], 0.01f); //X
+       scale[1] = 1.0f/max(scale[1], 0.01f); //Y
+       scale += 2;
+#ifdef DENOISE_TEMPORAL
+       scale[0] = 1.0f/max(scale[0], 0.01f); //T
+       scale++;
+#endif
+       
+       scale[0] = 1.0f/max(scale[0], 0.01f); //Depth
+
+       scale[1] = 1.0f/max(sqrtf(scale[1]), 0.01f); //NormalX
+       scale[2] = 1.0f/max(sqrtf(scale[2]), 0.01f); //NormalY
+       scale[3] = 1.0f/max(sqrtf(scale[3]), 0.01f); //NormalZ
+       
+       scale[4] = 1.0f/max(scale[4], 0.01f); //Shadow
+
+       scale[5] = 1.0f/max(sqrtf(scale[5]), 0.01f); //AlbedoR
+       scale[6] = 1.0f/max(sqrtf(scale[6]), 0.01f); //AlbedoG
+       scale[7] = 1.0f/max(sqrtf(scale[7]), 0.01f); //AlbedoB
+}
+
 ccl_device_inline float3 filter_get_pixel_color(float ccl_readonly_ptr buffer, 
int pass_stride)
 {
        return make_float3(ccl_get_feature(16), ccl_get_feature(18), 
ccl_get_feature(20));
@@ -182,10 +230,10 @@ ccl_device_inline void filter_get_features_sse(__m128 x, 
__m128 y, __m128 t, __m
 #ifdef DENOISE_TEMPORAL
        *(feature++) = t;
 #endif
+       *(feature++) = ccl_get_feature_sse(6);
        *(feature++) = ccl_get_feature_sse(0);
        *(feature++) = ccl_get_feature_sse(2);
        *(feature++) = ccl_get_feature_sse(4);
-       *(feature++) = ccl_get_feature_sse(6);
        *(feature++) = ccl_get_feature_sse(8);
        *(feature++) = ccl_get_feature_sse(10);
        *(feature++) = ccl_get_feature_sse(12);
@@ -205,7 +253,65 @@ ccl_device_inline void filter_get_features_sse(__m128 x, 
__m128 y, __m128 t, __m
 #endif
 }
 
-ccl_device_inline void filter_get_feature_variance_sse(__m128 x, __m128 y, 
__m128 active_pixels, float ccl_readonly_ptr buffer, __m128*features, __m128 
*scale, int pass_stride)
+ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, 
__m128 t, __m128 active_pixels, float ccl_readonly_ptr buffer, __m128 *scales, 
__m128 *mean, int pass_stride)
+{
+       *(scales++) = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, *(mean++))), 
active_pixels); //X
+       *(scales++) = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, *(mean++))), 
active_pixels); //Y
+#ifdef DENOISE_TEMPORAL
+       *(scales++) = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(t, *(mean++))), 
active_pixels); //T
+#endif
+
+       *(scales++) = 
_mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(6), *(mean++))), 
active_pixels); //Depth
+
+       __m128 diff = _mm_sub_ps(ccl_get_feature_sse(0), mean[0]);
+       __m128 scale3 = _mm_mul_ps(diff, diff);
+       diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[1]);
+       scale3 = _mm_add_ps(scale3, _mm_mul_ps(diff, diff));
+       diff = _mm_sub_ps(ccl_get_feature_sse(4), mean[2]);
+       scale3 = _mm_add_ps(scale3, _mm_mul_ps(diff, diff));
+       mean += 3;
+       *(scales++) = _mm_mask_ps(scale3, active_pixels); //NormalX
+       *(scales++) = _mm_mask_ps(scale3, active_pixels); //NormalY
+       *(scales++) = _mm_mask_ps(scale3, active_pixels); //NormalZ
+
+       *(scales++) = 
_mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(8), *(mean++))), 
active_pixels); //Shadow
+
+       diff = _mm_sub_ps(ccl_get_feature_sse(10), mean[0]);
+       scale3 = _mm_mul_ps(diff, diff);
+       diff = _mm_sub_ps(ccl_get_feature_sse(12), mean[1]);
+       scale3 = _mm_add_ps(scale3, _mm_mul_ps(diff, diff));
+       diff = _mm_sub_ps(ccl_get_feature_sse(14), mean[2]);
+       scale3 = _mm_add_ps(scale3, _mm_mul_ps(diff, diff));
+       mean += 3;
+       *(scales++) = _mm_mask_ps(scale3, active_pixels); //AlbedoR
+       *(scales++) = _mm_mask_ps(scale3, active_pixels); //AlbedoG
+       *(scales++) = _mm_mask_ps(scale3, active_pixels); //AlbedoB
+}
+
+ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+{
+       scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), 
_mm_set1_ps(0.01f))); //X
+       scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), 
_mm_set1_ps(0.01f))); //Y
+       scale += 2;
+#ifdef DENOISE_TEMPORAL
+       scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), 
_mm_set1_ps(0.01f))); //T
+       scale++;
+#endif
+       
+       scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), 
_mm_set1_ps(0.01f))); //Depth
+
+       scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[1])), 
_mm_set1_ps(0.01f))); //NormalX
+       scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[2]))

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to