[Bf-blender-cvs] [da3fdf0] master: Code Cleanup: in Cycles SSE replace macros with templates, skip unused code with preprocessor, simplify casts

Sv. Lockal Sat, 11 Jan 2014 10:30:11 -0800

Commit: da3fdf0b4bcded2a9473279078602ac24369472a
Author: Sv. Lockal
Date:   Sat Jan 11 22:20:03 2014 +0400
https://developer.blender.org/rBda3fdf0b4bcded2a9473279078602ac24369472a


Code Cleanup: in Cycles SSE replace macros with templates, skip unused code 
with preprocessor, simplify casts

===================================================================

M       intern/cycles/kernel/kernel_bvh_subsurface.h
M       intern/cycles/kernel/kernel_bvh_traversal.h
M       intern/cycles/kernel/svm/svm_noise.h
M       intern/cycles/util/util_simd.h

===================================================================

diff --git a/intern/cycles/kernel/kernel_bvh_subsurface.h b/intern/cycles/kernel/kernel_bvh_subsurface.h
index fb41bdc..bb51986 100644
--- a/intern/cycles/kernel/kernel_bvh_subsurface.h
+++ b/intern/cycles/kernel/kernel_bvh_subsurface.h
@@ -64,16 +64,16 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const 
Ray *ray, Intersectio
        const shuffle_swap_t shuf_identity = shuffle_swap_identity();
        const shuffle_swap_t shuf_swap = shuffle_swap_swap();
        
-       const __m128i pn = _mm_set_epi32(0x80000000, 0x80000000, 0x00000000, 
0x00000000);
+       const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 
0x80000000, 0, 0));
        __m128 Psplat[3], idirsplat[3];
 
        Psplat[0] = _mm_set_ps1(P.x);
        Psplat[1] = _mm_set_ps1(P.y);
        Psplat[2] = _mm_set_ps1(P.z);
 
-       idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
-       idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
-       idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
+       idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
+       idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
+       idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
 
        __m128 tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
 
@@ -143,8 +143,8 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const 
Ray *ray, Intersectio
                                const __m128 tminmaxy = 
_mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shuffley), Psplat[1]), 
idirsplat[1]);
                                const __m128 tminmaxz = 
_mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflez), Psplat[2]), 
idirsplat[2]);
 
-                               const __m128 tminmax = 
_mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, 
tsplat)), _mm_castsi128_ps(pn));
-                               const __m128 lrhit = _mm_cmple_ps(tminmax, 
shuffle_swap(tminmax, shuf_swap));
+                               const __m128 tminmax = 
_mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, 
tsplat)), pn);
+                               const __m128 lrhit = _mm_cmple_ps(tminmax, 
shuffle<2, 3, 0, 1>(tminmax));
 
                                /* decide which nodes to traverse next */
 #ifdef __VISIBILITY_FLAG__
@@ -242,9 +242,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const 
Ray *ray, Intersectio
                                                Psplat[1] = _mm_set_ps1(P.y);
                                                Psplat[2] = _mm_set_ps1(P.z);
 
-                                               idirsplat[0] = 
_mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
-                                               idirsplat[1] = 
_mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
-                                               idirsplat[2] = 
_mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
+                                               idirsplat[0] = 
_mm_xor_ps(_mm_set_ps1(idir.x), pn);
+                                               idirsplat[1] = 
_mm_xor_ps(_mm_set_ps1(idir.y), pn);
+                                               idirsplat[2] = 
_mm_xor_ps(_mm_set_ps1(idir.z), pn);
 
                                                tsplat = _mm_set_ps(-tmax, 
-tmax, 0.0f, 0.0f);
 
@@ -285,9 +285,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const 
Ray *ray, Intersectio
                        Psplat[1] = _mm_set_ps1(P.y);
                        Psplat[2] = _mm_set_ps1(P.z);
 
-                       idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), 
_mm_castsi128_ps(pn));
-                       idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), 
_mm_castsi128_ps(pn));
-                       idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), 
_mm_castsi128_ps(pn));
+                       idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
+                       idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
+                       idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
 
                        tsplat = _mm_set_ps(-tmax, -tmax, 0.0f, 0.0f);
 
diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h
index a78e72e..24f5568 100644
--- a/intern/cycles/kernel/kernel_bvh_traversal.h
+++ b/intern/cycles/kernel/kernel_bvh_traversal.h
@@ -73,16 +73,16 @@ ccl_device bool BVH_FUNCTION_NAME
        const shuffle_swap_t shuf_identity = shuffle_swap_identity();
        const shuffle_swap_t shuf_swap = shuffle_swap_swap();
        
-       const __m128i pn = _mm_set_epi32(0x80000000, 0x80000000, 0x00000000, 
0x00000000);
+       const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 
0x80000000, 0, 0));
        __m128 Psplat[3], idirsplat[3];
 
        Psplat[0] = _mm_set_ps1(P.x);
        Psplat[1] = _mm_set_ps1(P.y);
        Psplat[2] = _mm_set_ps1(P.z);
 
-       idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
-       idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
-       idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
+       idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
+       idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
+       idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
 
        __m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
 
@@ -169,7 +169,7 @@ ccl_device bool BVH_FUNCTION_NAME
 
                                /* calculate { c0min, c1min, -c0max, -c1max} */
                                __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, 
tminmaxy), _mm_max_ps(tminmaxz, tsplat));
-                               const __m128 tminmax = _mm_xor_ps(minmax, 
_mm_castsi128_ps(pn));
+                               const __m128 tminmax = _mm_xor_ps(minmax, pn);
 
 #if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
                                if(difl != 0.0f) {
@@ -189,7 +189,7 @@ ccl_device bool BVH_FUNCTION_NAME
                                        }
                                }
 #endif
-                               const __m128 lrhit = _mm_cmple_ps(tminmax, 
shuffle_swap(tminmax, shuf_swap));
+                               const __m128 lrhit = _mm_cmple_ps(tminmax, 
shuffle<2, 3, 0, 1>(tminmax));
 
                                /* decide which nodes to traverse next */
 #ifdef __VISIBILITY_FLAG__
@@ -307,9 +307,9 @@ ccl_device bool BVH_FUNCTION_NAME
                                        Psplat[1] = _mm_set_ps1(P.y);
                                        Psplat[2] = _mm_set_ps1(P.z);
 
-                                       idirsplat[0] = 
_mm_xor_ps(_mm_set_ps1(idir.x), _mm_castsi128_ps(pn));
-                                       idirsplat[1] = 
_mm_xor_ps(_mm_set_ps1(idir.y), _mm_castsi128_ps(pn));
-                                       idirsplat[2] = 
_mm_xor_ps(_mm_set_ps1(idir.z), _mm_castsi128_ps(pn));
+                                       idirsplat[0] = 
_mm_xor_ps(_mm_set_ps1(idir.x), pn);
+                                       idirsplat[1] = 
_mm_xor_ps(_mm_set_ps1(idir.y), pn);
+                                       idirsplat[2] = 
_mm_xor_ps(_mm_set_ps1(idir.z), pn);
 
                                        tsplat = _mm_set_ps(-isect->t, 
-isect->t, 0.0f, 0.0f);
 
@@ -343,9 +343,9 @@ ccl_device bool BVH_FUNCTION_NAME
                        Psplat[1] = _mm_set_ps1(P.y);
                        Psplat[2] = _mm_set_ps1(P.z);
 
-                       idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), 
_mm_castsi128_ps(pn));
-                       idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), 
_mm_castsi128_ps(pn));
-                       idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), 
_mm_castsi128_ps(pn));
+                       idirsplat[0] = _mm_xor_ps(_mm_set_ps1(idir.x), pn);
+                       idirsplat[1] = _mm_xor_ps(_mm_set_ps1(idir.y), pn);
+                       idirsplat[2] = _mm_xor_ps(_mm_set_ps1(idir.z), pn);
 
                        tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
 
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index a58dfdf..6ad10b7 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -32,21 +32,17 @@
 
 CCL_NAMESPACE_BEGIN
 
-#if defined(__KERNEL_SSE2__)
-#define FMA(a, b, c) _mm_add_ps(_mm_mul_ps((a), (b)), (c))
-#endif
-
 ccl_device int quick_floor(float x)
 {
        return float_to_int(x) - ((x < 0) ? 1 : 0);
 }
 
-#if defined(__KERNEL_SSE2__)
+#ifdef __KERNEL_SSE2__
 ccl_device_inline __m128i quick_floor_sse(const __m128 *x)
 {
-    __m128i b = _mm_cvttps_epi32(*x);
-    __m128i isneg = _mm_castps_si128(_mm_cmplt_ps(*x, _mm_set1_ps(0.0f)));
-    return _mm_add_epi32(b, isneg); // unsaturated add 0xffffffff is the same 
as subtract -1
+       __m128i b = _mm_cvttps_epi32(*x);
+       __m128i isneg = _mm_castps_si128(_mm_cmplt_ps(*x, _mm_set1_ps(0.0f)));
+       return _mm_add_epi32(b, isneg); // unsaturated add 0xffffffff is the 
same as subtract -1
 }
 #endif
 
@@ -84,32 +80,33 @@ ccl_device uint hash(uint kx, uint ky, uint kz)
 #undef final
 }
 
-#if defined(__KERNEL_SSE2__)
+#ifdef __KERNEL_SSE2__
 ccl_device_inline __m128i hash_sse(const __m128i *kx, const __m128i *ky, const 
__m128i *kz)
 {
 #define rot(x,k) _mm_or_si128(_mm_slli_epi32((x), (k)), _mm_srli_epi32((x), 
32-(k)))
 #define xor_rot(a, b, c) do {a = _mm_xor_si128(a, b); a = _mm_sub_epi32(a, 
rot(b, c));} while(0)
 
-    uint len = 3;
-    __m128i magic = _mm_set1_epi32(0xdeadbeef + (len << 2) + 13);
-    __m128i a = _mm_add_epi32(magic, *kx);
-    __m128i b = _mm_add_epi32(magic, *ky);
-    __m128i c = _mm_add_epi32(magic, *kz);
-    
-    xor_rot(c, b, 14);
-    xor_rot(a, c, 11);
-    xor_rot(b, a, 25);
-    xor_rot(c, b, 16);
-    xor_rot(a, c, 4);
-    xor_rot(b, a, 14);
-    xor_rot(c, b, 24);
-
-    return c;
+       uint len = 3;
+       __m128i magic = _mm_set1_epi32(0xdeadbeef + (len << 2) + 13);
+       __m128i a = _mm_add_epi32(magic, *kx);
+       __m128i b = _mm_add_epi32(magic, *ky);
+       __m128i c = _mm_add_epi32(magic, *kz);
+
+       xor_rot(c, b, 14);
+       xor_rot(a, c, 11);
+       xor_rot(b, a, 25);
+       xor_rot(c, b, 16);
+       xor_rot(a, c, 4);
+       xor_rot(b, a, 14);
+       xor_rot(c, b, 24);
+
+       return c;
 #undef rot
 #undef xor_rot
 }
 #endif
 
+#if 0 // unused
 ccl_device int imod(int a, int b)
 {
        a %= b;
@@ -120,48 +117,50 @@ ccl_device uint phash(int kx, int ky, int kz, int3 p)
 {
        return hash(imod(kx, p.x), imod(ky, p.y), imod(kz, p.z));
 }
+#endif
 
+#ifndef __KERNEL_SSE2__
 ccl_device float floorfrac(float x, int* i)
 {
        *i = quick_floor(x);
        return x - *i;
 }
-
-#if defined(__KERNEL_SSE2__)
+#else
 ccl_device_inline __m128 floorfrac_sse(const __m128 *x, __m128i *i)
 {
-    *i = quick_floor_sse(x);
-    return _mm_sub_ps(*x, _mm_cvtepi32_ps(*i));
+       *i = quick_floor_sse(x);
+       return _mm_sub_ps(*x, _mm_cvtepi32_ps(*i));
 }
 #endif
 
+#ifndef __KERNEL_SSE2__
 ccl_device float fade(float t)
 {
        return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
 }
-
-#if defined(__KERNEL_SSE2__)
+#else
 ccl_device_inline __m128 fade_sse(const __m128 *t)
 {
-  __m128 a = FMA(*t, _mm_set1_ps(6.0f), _mm_set1_ps(-15.0f));
-  __m128 b = FMA(*t, a, _mm_set1_ps(10.0f));
-  return _mm_mul_ps(_mm_mul_ps(*t, *t), _mm_mul_ps(*t, b));
+       __m128 a = fma(*t, _mm_set1_ps(6.0f), _mm_set1_ps(-15.0f));
+       __m128 b = fma(*t, a, _mm_set1_ps(10.0f));
+       return _mm_mul_ps(_mm_mul_ps(*t, *t), _mm_mul_ps(*t, b));
 }
 #endif
 
+#ifndef __KERNEL_SSE2__
 ccl_device float nerp(float t, float a, float b)
 {
        return (1.0f - t) * a + t * b;
 }
-
-#if defined(__KERNEL_SSE2__)
+#else
 ccl_device_inline __m128 nerp_sse(const __m128 *t, const __m128 *a, const 
__m128 *b)
 {
-    __m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), *t), *a);
-    return FMA(*t, *b, x1);
+       __m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), *t), *a);
+       return fma(*t, *b, x1);
 }
 #endif
 
+#ifndef __KERNEL_SSE2__
 ccl_device float grad(int hash, float x, float y, float z)
 {


@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
Bf-blender-cvs@blender.org
http://lists.blender.org/mailman/listinfo/bf-blender-cvs

[Bf-blender-cvs] [da3fdf0] master: Code Cleanup: in Cycles SSE replace macros with templates, skip unused code with preprocessor, simplify casts

Reply via email to