Commit: cfa6753518150a9e4174f4f5c4a1ca188e1d195d Author: Joseph Eagar Date: Tue Mar 30 19:00:08 2021 -0700 Branches: temp_bmesh_multires https://developer.blender.org/rBcfa6753518150a9e4174f4f5c4a1ca188e1d195d
wrote a fast and inaccurate distance to triangle function for DynTopo. Profiling consistently showed it to be a bottleneck. I've now written scalar versions of this function four times. For now I'm committing this inaccuration version. I'm also committing code for a vectorized version I found in a paper. Still need to rejigger the dyntopo code to use it though. =================================================================== M source/blender/blenkernel/CMakeLists.txt A source/blender/blenkernel/intern/closest_point_tri_sse.cc M source/blender/blenkernel/intern/pbvh_bmesh.c M source/blender/blenlib/intern/BLI_ghash_utils.c M source/blender/blenloader/intern/versioning_290.c M source/blender/nodes/geometry/nodes/node_geo_mesh_primitive_cone.cc M source/blender/nodes/geometry/nodes/node_geo_mesh_primitive_cube.cc M source/blender/nodes/geometry/nodes/node_geo_mesh_primitive_ico_sphere.cc M source/blender/nodes/geometry/nodes/node_geo_mesh_primitive_uv_sphere.cc =================================================================== diff --git a/source/blender/blenkernel/CMakeLists.txt b/source/blender/blenkernel/CMakeLists.txt index 8fbf49b31d4..e71c48fcbbc 100644 --- a/source/blender/blenkernel/CMakeLists.txt +++ b/source/blender/blenkernel/CMakeLists.txt @@ -275,6 +275,7 @@ set(SRC intern/workspace.c intern/world.c intern/writeavi.c + intern/closest_point_tri_sse.cc BKE_DerivedMesh.h BKE_action.h diff --git a/source/blender/blenkernel/intern/closest_point_tri_sse.cc b/source/blender/blenkernel/intern/closest_point_tri_sse.cc new file mode 100644 index 00000000000..b42cc370f6e --- /dev/null +++ b/source/blender/blenkernel/intern/closest_point_tri_sse.cc @@ -0,0 +1,396 @@ +// see Fast Distance Queries for Triangles, Lines, and +// Points using SSE Instructions +// http://jcgt.org/published/0003/04/05/paper.pdf + +typedef struct DistRet { + float dist[4]; +} DistRet; + +#if defined(__SSE2__) || defined(__SSE4__) +# include <cstdint> +# include <immintrin.h> +# include <smmintrin.h> +# include <xmmintrin.h> + +struct sseb { + // data + union { + __m128 m128; + uint32_t v[4]; + }; + + sseb() + { + } + + sseb(__m128 m) + { + m128 = m; + } +}; + +struct ssef { + // data + union { + __m128 m128; + float v[4]; + int i[4]; + }; + + ssef(float a, float b, float c, float d) + { + v[0] = a; + v[1] = b; + v[2] = c; + v[3] = d; + } + + ssef() + { + } + + ssef(__m128 f) + { + m128 = f; + } + + ssef(float f) + { + v[0] = v[1] = v[2] = v[3] = f; + } +}; +#endif + +#if defined(__SSE2__) && !defined(__SSE4__) + +__m128 my_mm_blendv_ps(__m128 a, __m128 b, __m128 mask) +{ + ssef fa(a); + ssef fb(b); + sseb fm(mask); + + ssef ret; + for (int i = 0; i < 4; i++) { + if (fm.v[i] & (1 << 31)) { + ret.v[i] = fb.v[i]; + } + else { + ret.v[i] = fa.v[i]; + } + } +} +# define _mm_blendv_ps my_mm_blendv_ps +#endif + +#ifdef __SSE2__ + +# include <array> +# include <vector> + +static inline struct sseb makeSSSEB(__m128 val) +{ + sseb r; + r.m128 = val; + + return r; +} + +// operations +const sseb operator&(const sseb &a, const sseb &b) +{ + return makeSSSEB(_mm_and_ps(a.m128, b.m128)); +} +const sseb operator|(const sseb &a, const sseb &b) +{ + return makeSSSEB(_mm_or_ps(a.m128, b.m128)); +} +const sseb operator|=(const sseb &a, const sseb &b) +{ + return a | b; +} +const sseb operator^(const sseb &a, const sseb &b) +{ + return makeSSSEB(_mm_xor_ps(a.m128, b.m128)); +} +bool all(const sseb &b) +{ + return _mm_movemask_ps(b.m128) == 0xf; +} +bool any(const sseb &b) +{ + return _mm_movemask_ps(b.m128) != 0x0; +} +bool none(const sseb &b) +{ + return _mm_movemask_ps(b.m128) == 0x0; +} +static inline struct ssef makeSSSEF(__m128 val) +{ + ssef r; + r.m128 = val; + + return r; +} + +// operations +const ssef operator+(const ssef &a, const ssef &b) +{ + return makeSSSEF(_mm_add_ps(a.m128, b.m128)); +} +const ssef operator-(const ssef &a, const ssef &b) +{ + return makeSSSEF(_mm_sub_ps(a.m128, b.m128)); +} +const ssef operator*(const ssef &a, const ssef &b) +{ + return makeSSSEF(_mm_mul_ps(a.m128, b.m128)); +} +const ssef operator/(const ssef &a, const ssef &b) +{ + return makeSSSEF(_mm_div_ps(a.m128, b.m128)); +} + +const sseb operator>=(const ssef &a, const ssef &b) +{ + __m128 r1 = _mm_cmpgt_ss(a.m128, b.m128); + __m128 r2 = _mm_cmpeq_ss(a.m128, b.m128); + + return makeSSSEB(_mm_or_ps(r1, r2)); +} +const sseb operator<=(const ssef &a, const ssef &b) +{ + __m128 r1 = _mm_cmplt_ss(a.m128, b.m128); + __m128 r2 = _mm_cmpeq_ss(a.m128, b.m128); + + return makeSSSEB(_mm_or_ps(r1, r2)); +} + +const sseb operator>(const ssef &a, const ssef &b) +{ + return makeSSSEB(_mm_cmpgt_ss(a.m128, b.m128)); +} + +const sseb operator<(const ssef &a, const ssef &b) +{ + return makeSSSEB(_mm_cmplt_ss(a.m128, b.m128)); +} + +const ssef min(const ssef &a, const ssef &b) +{ + return makeSSSEF(_mm_min_ps(a.m128, b.m128)); +} +const ssef sqr(const ssef &a) +{ + return makeSSSEF(_mm_mul_ps(a.m128, a.m128)); +} +const ssef sqrt(const ssef &a) +{ + return makeSSSEF(_mm_sqrt_ps(a.m128)); +} +const ssef select(const sseb &mask, const ssef &t, const ssef &f) +{ + return makeSSSEF(_mm_blendv_ps(f.m128, t.m128, mask.m128)); +} + +template<typename T> struct Vec3 { + // data + T x, y, z; + + Vec3(T x, T y, T z) : x(x), y(y), z(z) + { + } + + Vec3(T v) : x(v), y(v), z(v) + { + } +}; + +// operations +template<typename T> Vec3<T> operator+(const Vec3<T> &a, const Vec3<T> &b) +{ + return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); +} +template<typename T> Vec3<T> operator-(const Vec3<T> &a, const Vec3<T> &b) +{ + return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); +} +template<typename T> Vec3<T> operator*(const Vec3<T> &a, const Vec3<T> &b) +{ + return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); +} +template<typename T> Vec3<T> operator*(const T &a, const Vec3<T> &b) +{ + return Vec3<T>(b.x * a, b.y * a, b.z * a); +} +template<typename T> Vec3<T> operator*(const Vec3<T> &b, const T &a) +{ + return Vec3<T>(b.x * a, b.y * a, b.z * a); +} + +template<typename T> inline Vec3<T> rcp(const Vec3<T> &a) +{ + return Vec3<T>(rcp(a.x), rcp(a.y), rcp(a.z)); +} +template<typename T> inline Vec3<T> rsqrt(const Vec3<T> &a) +{ + return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); +} +template<typename T> T dot(const Vec3<T> &a, const Vec3<T> &b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +template<typename T> T length2b(const Vec3<T> &a) +{ + return dot(a, a); +} + +typedef sseb simdBool; +typedef ssef simdFloat; +typedef Vec3<ssef> simdFloatVec; +typedef std::array<simdFloatVec, 3> simdTriangle_type; +typedef std::array<simdFloatVec, 2> simdLine_type; +typedef simdFloatVec simdPoint_type; + +static inline simdFloat length2(const simdFloatVec &a) +{ + return a.x * a.x + a.y * a.y + a.z * a.z; +} + +static simdFloat zero = {0}; +static simdFloat one = {1.0f, 1.0f, 1.0f, 1.0f}; + +const simdFloatVec select(const sseb &mask, const simdFloatVec &t, const simdFloatVec &f) +{ + simdFloatVec r2(select(mask, t.x, f.x), select(mask, t.y, f.y), select(mask, t.z, f.z)); + + return r2; +} + +template<typename T> T clamp(const T &x, const T &lower, const T &upper) +{ + return max(lower, min(x, upper)); +} + +const simdFloat simdTriPoint2(simdFloatVec &oTriPoint, + const simdTriangle_type &iTri, + const simdPoint_type &iPoint) +{ + const simdFloatVec ab = iTri[1] - iTri[0]; + const simdFloatVec ac = iTri[2] - iTri[0]; + const simdFloatVec ap = iPoint - iTri[0]; + const simdFloat d1 = dot(ab, ap); + const simdFloat d2 = dot(ac, ap); + const simdBool mask1 = (d1 <= simdFloat(zero)) & (d2 <= simdFloat(zero)); + oTriPoint = iTri[0]; + simdBool exit(mask1); + if (all(exit)) + return length2(oTriPoint - iPoint); + + const simdFloatVec bp = iPoint - iTri[1]; + const simdFloat d3 = dot(ab, bp); + const simdFloat d4 = dot(ac, bp); + const simdBool mask2 = (d3 >= simdFloat(zero)) & (d4 <= d3); + // Closest point is the point iTri[1]. Update if necessary. + oTriPoint = select(exit, oTriPoint, select(mask2, iTri[1], oTriPoint)); + exit = exit | mask2; + if (all(exit)) + return length2(oTriPoint - iPoint); + + const simdFloatVec cp = iPoint - iTri[2]; + const simdFloat d5 = dot(ab, cp); + const simdFloat d6 = dot(ac, cp); + const simdBool mask3 = (d6 >= simdFloat(zero)) & (d5 <= d6); + // Closest point is the point iTri[2]. Update if necessary. + oTriPoint = select(exit, oTriPoint, select(mask3, iTri[2], oTriPoint)); + exit |= mask3; + if (all(exit)) + return length2(oTriPoint - iPoint); + + const simdFloat vc = d1 * d4 - d3 * d2; + const simdBool mask4 = (vc <= simdFloat(zero)) & (d1 >= simdFloat(zero)) & + (d3 <= simdFloat(zero)); + const simdFloat v1 = d1 / (d1 - d3); + const simdFloatVec answer1 = iTri[0] + v1 * ab; + // Closest point is on the line ab. Update if necessary. + oTriPoint = select(exit, oTriPoint, select(mask4, answer1, oTriPoint)); + exit |= mask4; + if (all(exit)) + return length2(oTriPoint - iPoint); + + const simdFloat vb = d5 * d2 - d1 * d6; + const simdBool mask5 = (vb <= simdFloat(zero)) & (d2 >= simdFloat(zero)) & + (d6 <= simdFloat(zero)); + const simdFloat w1 = d2 / (d2 - d6); + const simdFloatVec answer2 = iTri[0] + w1 * ac; + // Closest point is on the line ac. Update if necessary. + oTriPoint = select(exit, oTriPoint, select(mask5, answer2, oTriPoint)); + exit |= mask5; + if (all(exit)) + return length2(oTriPoint - iPoint); + + const simdFloat va = d3 * d6 - d5 * d4; + const simdBool mask6 = (va <= simdFloat(zero)) & ((d4 - d3) >= simdFloat(zero)) & + ((d5 - d6) >= simdFloat(zero)); + simdFloat w2 = (d4 - d3) / ((d4 - d3) + (d5 - d6)); + const simdFloatVec answer3 = iTri[1] + w2 * (iTri[2] - iTri[1]); + // Closest point is on the line bc. Update if necessary. + oTriPoint = select(exit, oTriPoint, select(mask6, answer3, oTriPoint)); + exit |= mask6; + if (all(exit)) + return length2(oTriPoint - iPoint); + + const simdFloat denom = simdFloat(one) / (va + vb + vc); + const simdFloat v2 = vb * denom; + const simdFloat w3 = vc * denom; + const simdFloatVec answer4 = iTri[0] + ab * v2 + ac * w3; + const simdBool mask7 = length2(answer4 - iPoint) < length2(oTriPoint - iPoint); + // Closest point is inside triangle. Update if necessary. + oTriPoint = select(exit, oTriPoint, select(mask7, answer4, oTriPoint)); + return length2(oTriPoint - iPoint); +} + +extern "C" struct DistRet dist_to_tri_sphere_fast_4( + float p[3], float v1[4][3], float v2[4][3], float v3[4][3], float n[4][3]) +{ + simdFloatVec mp((simdFloat(p[0]), simdFloat(p[1]), simdFloat(p[2]))); + simdFloatVec t1(simdFloat(v1[0][0], v1[1][0], v1[2][0], v1[3][0]), + simdFloat(v1[0][1], v1[1][1], v1[2][1], v1[3][1]), + simdFloat(v1[0][2], v1[1][2], v1[2][2], v1[3][2])); + simdFloatVec t2(simdFloat(v2[0][0], v2[1][0], v2[2][0], v2[3][0]), + simdFloat(v2[0][1], v2[1][1], v2[2][1], v2[3][1]), + simdFloat(v2[0][2], v2[1][2], v2[2][2], v2[3][2])); + simdFloatVec t3(simdFloat(v3[ @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list [email protected] https://lists.blender.org/mailman/listinfo/bf-blender-cvs
