Commit: cd5e1ff74e4f6443f3e4b836dd23fe46b56cb7ed
Author: Thomas Dinges
Date:   Fri Jun 13 21:13:18 2014 +0200
https://developer.blender.org/rBcd5e1ff74e4f6443f3e4b836dd23fe46b56cb7ed

Cycles Refactor: Add SSE Utility code from Embree for cleaner SSE code.

This makes the code a bit easier to understand, and might come in handy
if we want to reuse more Embree code.

Differential Revision: https://developer.blender.org/D482

Code by Brecht, with fixes by Lockal, Sergey and myself.

===================================================================

M       intern/cycles/kernel/geom/geom_bvh_shadow.h
M       intern/cycles/kernel/geom/geom_bvh_subsurface.h
M       intern/cycles/kernel/geom/geom_bvh_traversal.h
M       intern/cycles/kernel/geom/geom_curve.h
M       intern/cycles/kernel/kernel_avx.cpp
M       intern/cycles/kernel/kernel_compat_cpu.h
M       intern/cycles/kernel/svm/svm_image.h
M       intern/cycles/kernel/svm/svm_noise.h
M       intern/cycles/kernel/svm/svm_texture.h
M       intern/cycles/render/curves.cpp
M       intern/cycles/render/tile.cpp
M       intern/cycles/util/CMakeLists.txt
M       intern/cycles/util/util_color.h
M       intern/cycles/util/util_half.h
M       intern/cycles/util/util_optimization.h
A       intern/cycles/util/util_simd.cpp
M       intern/cycles/util/util_simd.h
A       intern/cycles/util/util_sseb.h
A       intern/cycles/util/util_ssef.h
A       intern/cycles/util/util_ssei.h
M       intern/cycles/util/util_types.h

===================================================================

diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index 98bf82b..48876da 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -68,15 +68,15 @@ ccl_device bool BVH_FUNCTION_NAME
        const shuffle_swap_t shuf_identity = shuffle_swap_identity();
        const shuffle_swap_t shuf_swap = shuffle_swap_swap();
        
-       const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 
0x80000000, 0, 0));
-       __m128 Psplat[3], idirsplat[3];
+       const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+       ssef Psplat[3], idirsplat[3];
        shuffle_swap_t shufflexyz[3];
 
-       Psplat[0] = _mm_set_ps1(P.x);
-       Psplat[1] = _mm_set_ps1(P.y);
-       Psplat[2] = _mm_set_ps1(P.z);
+       Psplat[0] = ssef(P.x);
+       Psplat[1] = ssef(P.y);
+       Psplat[2] = ssef(P.z);
 
-       __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+       ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
 
        gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, 
shufflexyz);
 #endif
@@ -132,27 +132,27 @@ ccl_device bool BVH_FUNCTION_NAME
                                /* Intersect two child bounding boxes, SSE3 
version adapted from Embree */
 
                                /* fetch node data */
-                               const __m128 *bvh_nodes = 
(__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+                               const ssef *bvh_nodes = 
(ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
                                const float4 cnodes = ((float4*)bvh_nodes)[3];
 
                                /* intersect ray against child nodes */
-                               const __m128 tminmaxx = 
_mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), 
idirsplat[0]);
-                               const __m128 tminmaxy = 
_mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), 
idirsplat[1]);
-                               const __m128 tminmaxz = 
_mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), 
idirsplat[2]);
+                               const ssef tminmaxx = 
(shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+                               const ssef tminmaxy = 
(shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+                               const ssef tminmaxz = 
(shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
 
                                /* calculate { c0min, c1min, -c0max, -c1max} */
-                               __m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, 
tminmaxy), _mm_max_ps(tminmaxz, tsplat));
-                               const __m128 tminmax = _mm_xor_ps(minmax, pn);
-                               const __m128 lrhit = _mm_cmple_ps(tminmax, 
shuffle<2, 3, 0, 1>(tminmax));
+                               const ssef minmax = max(max(tminmaxx, 
tminmaxy), max(tminmaxz, tsplat));
+                               const ssef tminmax = minmax ^ pn;
+                               const sseb lrhit = tminmax <= shuffle<2, 3, 0, 
1>(tminmax);
 
                                /* decide which nodes to traverse next */
 #ifdef __VISIBILITY_FLAG__
                                /* this visibility test gives a 5% performance 
hit, how to solve? */
-                               traverseChild0 = (_mm_movemask_ps(lrhit) & 1) 
&& (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
-                               traverseChild1 = (_mm_movemask_ps(lrhit) & 2) 
&& (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
+                               traverseChild0 = (movemask(lrhit) & 1) && 
(__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
+                               traverseChild1 = (movemask(lrhit) & 2) && 
(__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
 #else
-                               traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
-                               traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+                               traverseChild0 = (movemask(lrhit) & 1);
+                               traverseChild1 = (movemask(lrhit) & 2);
 #endif
 #endif // __KERNEL_SSE2__
 
@@ -164,9 +164,7 @@ ccl_device bool BVH_FUNCTION_NAME
 #if !defined(__KERNEL_SSE2__)
                                        bool closestChild1 = (c1min < c0min);
 #else
-                                       union { __m128 m128; float v[4]; } 
uminmax;
-                                       uminmax.m128 = tminmax;
-                                       bool closestChild1 = uminmax.v[1] < 
uminmax.v[0];
+                                       bool closestChild1 = tminmax[1] < 
tminmax[0];
 #endif
 
                                        if(closestChild1) {
@@ -301,12 +299,12 @@ ccl_device bool BVH_FUNCTION_NAME
                                        num_hits_in_instance = 0;
 
 #if defined(__KERNEL_SSE2__)
-                                       Psplat[0] = _mm_set_ps1(P.x);
-                                       Psplat[1] = _mm_set_ps1(P.y);
-                                       Psplat[2] = _mm_set_ps1(P.z);
+                                       Psplat[0] = ssef(P.x);
+                                       Psplat[1] = ssef(P.y);
+                                       Psplat[2] = ssef(P.z);
 
                                        isect_array->t = isect_t;
-                                       tsplat = _mm_set_ps(-isect_t, -isect_t, 
0.0f, 0.0f);
+                                       tsplat = ssef(0.0f, 0.0f, -isect_t, 
-isect_t);
 
                                        gen_idirsplat_swap(pn, shuf_identity, 
shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -348,13 +346,13 @@ ccl_device bool BVH_FUNCTION_NAME
                        }
 
 #if defined(__KERNEL_SSE2__)
-                       Psplat[0] = _mm_set_ps1(P.x);
-                       Psplat[1] = _mm_set_ps1(P.y);
-                       Psplat[2] = _mm_set_ps1(P.z);
+                       Psplat[0] = ssef(P.x);
+                       Psplat[1] = ssef(P.y);
+                       Psplat[2] = ssef(P.z);
 
                        isect_t = tmax;
                        isect_array->t = isect_t;
-                       tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+                       tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
 
                        gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, 
idirsplat, shufflexyz);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index a19f05d..a8f57cf 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -65,15 +65,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
        const shuffle_swap_t shuf_identity = shuffle_swap_identity();
        const shuffle_swap_t shuf_swap = shuffle_swap_swap();
        
-       const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 
0x80000000, 0, 0));
-       __m128 Psplat[3], idirsplat[3];
+       const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+       ssef Psplat[3], idirsplat[3];
        shuffle_swap_t shufflexyz[3];
 
-       Psplat[0] = _mm_set_ps1(P.x);
-       Psplat[1] = _mm_set_ps1(P.y);
-       Psplat[2] = _mm_set_ps1(P.z);
+       Psplat[0] = ssef(P.x);
+       Psplat[1] = ssef(P.y);
+       Psplat[2] = ssef(P.z);
 
-       __m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+       ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
 
        gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, 
shufflexyz);
 #endif
@@ -131,25 +131,27 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
                                /* Intersect two child bounding boxes, SSE3 
version adapted from Embree */
 
                                /* fetch node data */
-                               const __m128 *bvh_nodes = 
(__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+                               const ssef *bvh_nodes = 
(ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
                                const float4 cnodes = ((float4*)bvh_nodes)[3];
 
                                /* intersect ray against child nodes */
-                               const __m128 tminmaxx = 
_mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), 
idirsplat[0]);
-                               const __m128 tminmaxy = 
_mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), 
idirsplat[1]);
-                               const __m128 tminmaxz = 
_mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), 
idirsplat[2]);
+                               const ssef tminmaxx = 
(shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+                               const ssef tminmaxy = 
(shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+                               const ssef tminmaxz = 
(shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
 
-                               const __m128 tminmax = 
_mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, 
tsplat)), pn);
-                               const __m128 lrhit = _mm_cmple_ps(tminmax, 
shuffle<2, 3, 0, 1>(tminmax));
+                               /* calculate { c0min, c1min, -c0max, -c1max} */
+                               const ssef minmax = max(max(tminmaxx, 
tminmaxy), max(tminmaxz, tsplat));
+                               const ssef tminmax = minmax ^ pn;
+                               const sseb lrhit = tminmax <= shuffle<2, 3, 0, 
1>(tminmax);
 
                                /* decide which nodes to traverse next */
 #ifdef __VISIBILITY_FLAG__
                                /* this visibility test gives a 5% performance 
hit, how to solve? */
-                               traverseChild0 = (_mm_movemask_ps(lrhit) & 1) 
&& (__float_as_uint(cnodes.z) & visibility);
-                               traverseChild1 = (_mm_movemask_ps(lrhit) & 2) 
&& (__float_as_uint(cnodes.w) & visibility);
+                               traverseChild0 = (movemask(lrhit) & 1) && 
(__float_as_uint(cnodes.z) & visibility);
+                               traverseChild1 = (movemask(lrhit) & 2) && 
(__float_as_uint(cnodes.w) & visibility);
 #else
-                               traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
-                               traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+                               traverseChild0 = (movemask(lrhit) & 1);
+                               traverseChild1 = (movemask(lrhit) & 2);
 #endif
 #endif // __KERNEL_SSE2__
 
@@ -161,9 +163,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #if !defined(__KERNEL_SSE2__)
                                        bool closestChild1 = (c1min < c0min);
 #else
-                                       union { __m128 m128; float v[4]; } 
uminmax;
-                                       uminmax.m128 = tminmax;
-                                       bool closestChild1 = uminmax.v[1] < 
uminmax.v[0];
+                                       bool closestChild1 = tminmax[1] < 
tminmax[0];
 #endif
 
                                        if(closestChild1) {
@@ -243,11 +243,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #endif
 
 #if defined(__KERNEL_SSE2__)
-                                               Psplat[0] = _mm_set_ps1(P.x);
-                                               Psplat[1] = _mm_set_ps1(P.y);
-                                               Psplat[2] = _mm_set_ps1(P.z);
+                                               Psplat[0] = ssef(P.x);
+                                               Psplat[1] = ssef(P.y);
+                                               Psplat[2] = ssef(P.z);
 
-                                               tsplat = _mm_set_ps(-isect_t, 
-isect_t, 0.0f, 0.0f);
+                                               tsplat = ssef(0.0f, 0.0f, 
-isect_t, -isect_t);
 
                                                gen_idirsplat_swap(pn, 
shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -279,11 +279,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #endif
 
 #if defined(__KERNEL_SSE2__)
-                       Psplat[0] = _mm_set_ps1(P.x);
-                       Psplat[1] = _mm_set_ps1(P.y);
-                       Psplat[2] = _mm_set_ps1(P.z);
+                       Psplat[0] = ssef(P.x);
+                       Psplat[1] = ssef(P.y);
+                       Psplat[2] = ssef(P.z);
 
-                       tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+                       tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
 
                        gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, 
idirsplat, shufflexyz);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index 9fd40f9..e39228c 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -72,15 +72,15 @@ ccl_device bool BVH_FUNCTION_NAME
        const shuffle_swap_t shuf_identity = shuffle_swap_identity();
        const shuffle_swap_t shuf_swap = shuffle_swap_swap();
        
-       const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 
0x80000000, 0, 0));
-       __m128 Psp

@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
Bf-blender-cvs@blender.org
http://lists.blender.org/mailman/listinfo/bf-blender-cvs

Reply via email to