Commit: 0ffcfcd8a8f8bcbba7454018b0e7345319e62081
Author: Lukas Stockner
Date:   Mon Nov 14 17:27:35 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB0ffcfcd8a8f8bcbba7454018b0e7345319e62081

Cycles: Implement SSE4.1-vectorized NLM weights

===================================================================

M       intern/cycles/kernel/kernel_filter_pre.h
M       intern/cycles/kernel/kernel_filter_util.h

===================================================================

diff --git a/intern/cycles/kernel/kernel_filter_pre.h b/intern/cycles/kernel/kernel_filter_pre.h
index 2239f17..3f8b5b5 100644
--- a/intern/cycles/kernel/kernel_filter_pre.h
+++ b/intern/cycles/kernel/kernel_filter_pre.h
@@ -192,8 +192,33 @@ ccl_device float nlm_weight(int px, int py, int qx, int qy, float ccl_readonly_p
        int2 low_dPatch = make_int2(max(max(rect.x - qx, rect.x - px),  -f), 
max(max(rect.y - qy, rect.y - py),  -f));
        int2 high_dPatch = make_int2(min(min(rect.z - qx, rect.z - px), f+1), 
min(min(rect.w - qy, rect.w - py), f+1));
 
-       float dI = 0.0f;
        int dIdx = low_dPatch.x + low_dPatch.y*w;
+#ifdef __KERNEL_SSE41__
+       __m128 a_sse = _mm_set1_ps(a), k_2_sse = _mm_set1_ps(k_2);
+       __m128 dI_sse = _mm_setzero_ps();
+       __m128 highX_sse = _mm_set1_ps(high_dPatch.x);
+       for(int dy = low_dPatch.y; dy < high_dPatch.y; dy++) {
+               int dx;
+               for(dx = low_dPatch.x; dx < high_dPatch.x; dx+=4, dIdx+=4) {
+                       __m128 active = 
_mm_cmplt_ps(_mm_add_ps(_mm_set1_ps(dx), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)), 
highX_sse);
+                       __m128 p_color[3], q_color[3], p_var[3], q_var[3];
+                       filter_get_pixel_color_sse(p_buffer + dIdx, active, 
p_color, pass_stride);
+                       filter_get_pixel_color_sse(q_buffer + dIdx, active, 
q_color, pass_stride);
+                       filter_get_pixel_variance_3_sse(p_buffer + dIdx, 
active, p_var, pass_stride);
+                       filter_get_pixel_variance_3_sse(q_buffer + dIdx, 
active, q_var, pass_stride);
+
+                       __m128 diff = _mm_sub_ps(p_color[0], q_color[0]);
+                       dI_sse = _mm_add_ps(dI_sse, 
_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(diff, diff), _mm_mul_ps(a_sse, 
_mm_add_ps(p_var[0], _mm_min_ps(p_var[0], q_var[0])))), 
_mm_rcp_ps(_mm_add_ps(_mm_set1_ps(1e-7f), _mm_mul_ps(k_2_sse, 
_mm_add_ps(p_var[0], q_var[0]))))));
+                       diff = _mm_sub_ps(p_color[1], q_color[1]);
+                       dI_sse = _mm_add_ps(dI_sse, 
_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(diff, diff), _mm_mul_ps(a_sse, 
_mm_add_ps(p_var[1], _mm_min_ps(p_var[1], q_var[1])))), 
_mm_rcp_ps(_mm_add_ps(_mm_set1_ps(1e-7f), _mm_mul_ps(k_2_sse, 
_mm_add_ps(p_var[1], q_var[1]))))));
+                       diff = _mm_sub_ps(p_color[2], q_color[2]);
+                       dI_sse = _mm_add_ps(dI_sse, 
_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(diff, diff), _mm_mul_ps(a_sse, 
_mm_add_ps(p_var[2], _mm_min_ps(p_var[2], q_var[2])))), 
_mm_rcp_ps(_mm_add_ps(_mm_set1_ps(1e-7f), _mm_mul_ps(k_2_sse, 
_mm_add_ps(p_var[2], q_var[2]))))));
+               }
+               dIdx += w-(dx - low_dPatch.x);
+       }
+       float dI = _mm_hsum_ss(dI_sse);
+#else
+       float dI = 0.0f;
        for(int dy = low_dPatch.y; dy < high_dPatch.y; dy++) {
                for(int dx = low_dPatch.x; dx < high_dPatch.x; dx++, dIdx++) {
                        float3 diff = filter_get_pixel_color(p_buffer + dIdx, 
pass_stride) - filter_get_pixel_color(q_buffer + dIdx, pass_stride);
@@ -204,6 +229,7 @@ ccl_device float nlm_weight(int px, int py, int qx, int qy, float ccl_readonly_p
                }
                dIdx += w-(high_dPatch.x - low_dPatch.x);
        }
+#endif
        dI *= 1.0f / (3.0f * (high_dPatch.x - low_dPatch.x) * (high_dPatch.y - 
low_dPatch.y));
 
        return fast_expf(-max(0.0f, dI));
diff --git a/intern/cycles/kernel/kernel_filter_util.h b/intern/cycles/kernel/kernel_filter_util.h
index cd57993..89e8671 100644
--- a/intern/cycles/kernel/kernel_filter_util.h
+++ b/intern/cycles/kernel/kernel_filter_util.h
@@ -381,6 +381,13 @@ ccl_device_inline void filter_get_pixel_color_sse(float ccl_readonly_ptr buffer,
        color[2] = _mm_mask_ps(ccl_get_feature_sse(20), active_pixels);
 }
 
+ccl_device_inline void filter_get_pixel_variance_3_sse(float ccl_readonly_ptr 
buffer, __m128 active_pixels, __m128 *var, int pass_stride)
+{
+       var[0] = _mm_mask_ps(ccl_get_feature_sse(17), active_pixels);
+       var[1] = _mm_mask_ps(ccl_get_feature_sse(19), active_pixels);
+       var[2] = _mm_mask_ps(ccl_get_feature_sse(21), active_pixels);
+}
+
 ccl_device_inline __m128 filter_get_pixel_variance_sse(float ccl_readonly_ptr 
buffer, __m128 active_pixels, int pass_stride)
 {
        return _mm_mask_ps(_mm_mul_ps(_mm_set1_ps(1.0f/3.0f), 
_mm_add_ps(_mm_add_ps(ccl_get_feature_sse(17), ccl_get_feature_sse(19)), 
ccl_get_feature_sse(21))), active_pixels);

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
https://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to