Commit: ab32a1807dd153723d26a7d53895ed071233dafc
Author: Sv. Lockal
Date:   Thu Apr 3 23:34:53 2014 +0400
https://developer.blender.org/rBab32a1807dd153723d26a7d53895ed071233dafc

Cycles: SSE optimization for Voronoi cells texture

Gives 5-6% speedup for Caterpillar_PatazStudio.blend.

Reviewed By: brecht, dingto

Differential Revision: https://developer.blender.org/D419

===================================================================

M       intern/cycles/kernel/svm/svm_noise.h
M       intern/cycles/kernel/svm/svm_texture.h
M       intern/cycles/kernel/svm/svm_voronoi.h
M       intern/cycles/util/util_math.h
M       intern/cycles/util/util_simd.h

===================================================================

diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 282ad19..91dda89 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -357,15 +357,13 @@ ccl_device float3 cellnoise_color(float3 p)
        return make_float3(r, g, b);
 }
 #else
-ccl_device float3 cellnoise_color(const float3& p)
+ccl_device __m128 cellnoise_color(const __m128& p)
 {
-       __m128i v_yxz = quick_floor_sse(_mm_setr_ps(p.y, p.x, p.z, 0.0f));
-       __m128i v_xyy = shuffle<1, 0, 0, 3>(v_yxz);
-       __m128i v_zzx = shuffle<2, 2, 1, 3>(v_yxz);
-       __m128 rgb = bits_to_01_sse(hash_sse(v_xyy, v_yxz, v_zzx));
-
-       float3 result = *(float3*)&rgb;
-       return result;
+       __m128i ip = quick_floor_sse(p);
+       __m128i ip_yxz = shuffle<1, 0, 2, 3>(ip);
+       __m128i ip_xyy = shuffle<0, 1, 1, 3>(ip);
+       __m128i ip_zzx = shuffle<2, 2, 0, 3>(ip);
+       return bits_to_01_sse(hash_sse(ip_xyy, ip_yxz, ip_zzx));
 }
 #endif
 
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h
index 8ced839..5fd9204 100644
--- a/intern/cycles/kernel/svm/svm_texture.h
+++ b/intern/cycles/kernel/svm/svm_texture.h
@@ -18,6 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Voronoi Distances */
 
+#if 0
 ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, float e)
 {
 #if 0
@@ -43,8 +44,7 @@ ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d,
 }
 
 /* Voronoi / Worley like */
-
-ccl_device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
+ccl_device_inline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
 {
        float da[4];
        float3 pa[4];
@@ -119,7 +119,95 @@ ccl_device_noinline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
 
        return result;
 }
+#endif
+
+ccl_device float voronoi_F1_distance(float3 p)
+{
+       /* returns squared distance in da */
+       float da = 1e10f;
+
+#ifndef __KERNEL_SSE2__
+       int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
+
+       for (int xx = -1; xx <= 1; xx++) {
+               for (int yy = -1; yy <= 1; yy++) {
+                       for (int zz = -1; zz <= 1; zz++) {
+                               float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
+                               float3 vp = ip + cellnoise_color(ip);
+                               float d = len_squared(p - vp);
+                               da = min(d, da);
+                       }
+               }
+       }
+#else
+       __m128 vec_p = load_m128(p);
+       __m128i xyzi = quick_floor_sse(vec_p);
+
+       for (int xx = -1; xx <= 1; xx++) {
+               for (int yy = -1; yy <= 1; yy++) {
+                       for (int zz = -1; zz <= 1; zz++) {
+                               __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0)));
+                               __m128 vp = _mm_add_ps(ip, cellnoise_color(ip));
+                               float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp));
+                               da = min(d, da);
+                       }
+               }
+       }
+#endif
+
+       return da;
+}
+
+ccl_device float3 voronoi_F1_color(float3 p)
+{
+       /* returns color of the nearest point */
+       float da = 1e10f;
+
+#ifndef __KERNEL_SSE2__
+       float3 pa;
+       int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
+
+       for (int xx = -1; xx <= 1; xx++) {
+               for (int yy = -1; yy <= 1; yy++) {
+                       for (int zz = -1; zz <= 1; zz++) {
+                               float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
+                               float3 vp = ip + cellnoise_color(ip);
+                               float d = len_squared(p - vp);
+
+                               if(d < da) {
+                                       da = d;
+                                       pa = vp;
+                               }
+                       }
+               }
+       }
+
+       return cellnoise_color(pa);
+#else
+       __m128 pa, vec_p = load_m128(p);
+       __m128i xyzi = quick_floor_sse(vec_p);
+
+       for (int xx = -1; xx <= 1; xx++) {
+               for (int yy = -1; yy <= 1; yy++) {
+                       for (int zz = -1; zz <= 1; zz++) {
+                               __m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0)));
+                               __m128 vp = _mm_add_ps(ip, cellnoise_color(ip));
+                               float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp));
+
+                               if(d < da) {
+                                       da = d;
+                                       pa = vp;
+                               }
+                       }
+               }
+       }
+
+       __m128 color = cellnoise_color(pa);
+       return (float3 &)color;
+#endif
+}
 
+#if 0
 ccl_device float voronoi_F1(float3 p) { return voronoi_Fn(p, 0.0f, 0, -1).w; }
 ccl_device float voronoi_F2(float3 p) { return voronoi_Fn(p, 0.0f, 1, -1).w; }
 ccl_device float voronoi_F3(float3 p) { return voronoi_Fn(p, 0.0f, 2, -1).w; }
@@ -139,6 +227,7 @@ ccl_device float voronoi_F3S(float3 p) { return 2.0f*voronoi_F3(p) - 1.0f; }
 ccl_device float voronoi_F4S(float3 p) { return 2.0f*voronoi_F4(p) - 1.0f; }
 ccl_device float voronoi_F1F2S(float3 p) { return 2.0f*voronoi_F1F2(p) - 1.0f; }
 ccl_device float voronoi_CrS(float3 p) { return 2.0f*voronoi_Cr(p) - 1.0f; }
+#endif
 
 /* Noise Bases */
 
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index 7f597dc..083a2f3 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -20,23 +20,16 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_noinline float4 svm_voronoi(NodeVoronoiColoring coloring, float3 p)
 {
-       /* compute distance and point coordinate of 4 nearest neighbours */
-       float4 dpa0 = voronoi_Fn(p, 1.0f, 0, -1);
-
-       /* output */
-       float fac;
-       float3 color;
-
        if(coloring == NODE_VORONOI_INTENSITY) {
-               fac = fabsf(dpa0.w);
-               color = make_float3(fac, fac, fac);
+               /* compute squared distance to the nearest neighbour */
+               float fac = voronoi_F1_distance(p);
+               return make_float4(fac, fac, fac, fac);
        }
        else {
-               color = cellnoise_color(float4_to_float3(dpa0));
-               fac = average(color);
+               /* compute color of the nearest neighbour */
+               float3 color = voronoi_F1_color(p);
+               return make_float4(color.x, color.y, color.z, average(color));
        }
-
-       return make_float4(color.x, color.y, color.z, fac);
 }
 
 ccl_device void svm_node_tex_voronoi(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index b57aa26..53ed681 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -163,11 +163,7 @@ ccl_device_inline float clamp(float a, float mn, float mx)
 
 ccl_device_inline int float_to_int(float f)
 {
-#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER)
-       return _mm_cvtt_ss2si(_mm_load_ss(&f));
-#else
        return (int)f;
-#endif
 }
 
 ccl_device_inline int floor_to_int(float f)
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 486816c..679556e 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -220,6 +220,18 @@ ccl_device_inline const __m128 dot3_splat(const __m128& a, const __m128& b)
 #endif
 }
 
+/* squared length taking only specified axes into account */
+template<size_t X, size_t Y, size_t Z, size_t W>
+ccl_device_inline float len_squared(const __m128& a)
+{
+#ifndef __KERNEL_SSE41__
+       float4& t = (float4 &)a;
+       return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) + (W ? t.w * t.w : 0.0f);
+#else
+       return _mm_cvtss_f32(_mm_dp_ps(a, a, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf));
+#endif
+}
+
 ccl_device_inline float dot3(const __m128& a, const __m128& b)
 {
 #ifdef __KERNEL_SSE41__

_______________________________________________
Bf-blender-cvs mailing list
[email protected]
http://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to