[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #29 from g.peterh...@t-online.de --- (In reply to Jakub Jelinek from comment #28) > As long as the scale is a power of two or 1.0 / power of two, I don't see > why any version wouldn't be inaccurate. Yes, but the constant scale_up is incorrectly selected. scale_up = std::exp2(Type(limits::max_exponent-1)) --> ok scale_up = std::exp2(Type(limits::max_exponent/2)) --> error scale_up = prev_power2(sqrt_max) --> error scale_down = std::exp2(Type(limits::min_exponent-1)) also seems to me to be more favorable. PS: There seems to be a problem with random numbers and std::float16_t, which is why I use std::uniform_real_distribution. I have not yet found out exactly where the error lies. thx Gero template inline constexpr Type hypot_exp(Type x, Type y, Type z) noexcept { using limits = std::numeric_limits; constexpr Type zero = 0; x = std::abs(x); y = std::abs(y); z = std::abs(z); if (!(std::isnormal(x) && std::isnormal(y) && std::isnormal(z))) [[unlikely]] { if (std::isinf(x) | std::isinf(y) | std::isinf(z)) return limits::infinity(); else if (std::isnan(x) | std::isnan(y) | std::isnan(z)) return limits::quiet_NaN(); else { const bool xz{x == zero}, yz{y == zero}, zz{z == zero}; if (xz) { if (yz) return zz ? 
zero : z; else if (zz)return y; } else if (yz && zz) return x; } } if (x > z) std::swap(x, z); if (y > z) std::swap(y, z); int exp; z = std::frexp(z, &exp); y = std::ldexp(y, -exp); x = std::ldexp(x, -exp); return std::ldexp(std::sqrt(__builtin_assoc_barrier(x*x + y*y) + z*z), exp); } template inline constexpr Type hypot_gp(Type x, Type y, Type z)noexcept { using limits = std::numeric_limits; constexpr Type sqrt_min= std::sqrt(limits::min()), sqrt_max= std::sqrt(limits::max()), scale_up= std::exp2(Type(limits::max_exponent-1)), scale_down = std::exp2(Type(limits::min_exponent-1)), zero= 0; x = std::abs(x); y = std::abs(y); z = std::abs(z); if (!(std::isnormal(x) && std::isnormal(y) && std::isnormal(z))) [[unlikely]] { if (std::isinf(x) | std::isinf(y) | std::isinf(z)) return limits::infinity(); else if (std::isnan(x) | std::isnan(y) | std::isnan(z)) return limits::quiet_NaN(); else { const bool xz{x == zero}, yz{y == zero}, zz{z == zero}; if (xz) { if (yz) return zz ? zero : z; else if (zz)return y; } else if (yz && zz) return x; } } if (x > z) std::swap(x, z); if (y > z) std::swap(y, z); if (const bool b{z>=sqrt_min}; b && z<=sqrt_max) [[likely]] { // no scale return std::sqrt(__builtin_assoc_barrier(x*x + y*y) + z*z); } else { const Type scale = b ? scale_down : scale_up; x *= scale; y *= scale; z *= scale; return std::sqrt(__builtin_assoc_barrier(x*x + y*y) + z*z) / scale; } } template voidtest(const size_t count, const Type min, const Type max, const Type factor) { std::random_device rd{}; std::mt19937 gen{rd()}; std::uniform_real_distribution dis{min, max}; auto rnd = [&]() noexcept -> Type { return Type(dis(gen) * factor); }; for (size_t i=0; i; test(1024*1024, 0.5, 1, limits::max()); test(1024*1024, 0, 1, limits::min()); return EXIT_SUCCESS; }
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77776 --- Comment #28 from Jakub Jelinek --- As long as the scale is a power of two or 1.0 / power of two, I don't see why any version wouldn't be inaccurate.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #27 from g.peterh...@t-online.de --- Hi Matthias, thanks for your benchmark. I still have 2 questions: 1) Accuracy The frexp/ldexp variant seems to be the most accurate; is that correct? Then other constants would have to be used in hypot_gp: scale_up = std::exp2(Type(limits::max_exponent-1)) scale_down = std::exp2(Type(limits::min_exponent-1)) 2) Speed Your benchmark outputs several columns (Δ)Latency/(Δ)Throughput/Speedup. What exactly do the values stand for; what should be optimized for? thx Gero
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77776 --- Comment #26 from g.peterh...@t-online.de --- must of course be "... / scale". How can I still edit posts?
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #25 from g.peterh...@t-online.de --- Hi Matthias, to get good results on average (all FP-types: (B)FP16..FP128, scalar/vectorized(SIMD)/parallel/...) this algorithm seems to me (so far) to be suitable: template inline constexpr Type hypot_gp(Type x, Type y, Type z)noexcept { using limits = std::numeric_limits; constexpr Type sqrt_min= std::sqrt(limits::min()), sqrt_max= std::sqrt(limits::max()), scale_up= std::exp2( Type(limits::max_exponent/2)), scale_down = std::exp2(-Type(limits::max_exponent/2)), zero= 0; x = std::abs(x); y = std::abs(y); z = std::abs(z); if (!(std::isnormal(x) && std::isnormal(y) && std::isnormal(z))) [[unlikely]] { if (std::isinf(x) | std::isinf(y) | std::isinf(z)) return limits::infinity(); else if (std::isnan(x) | std::isnan(y) | std::isnan(z)) return limits::quiet_NaN(); else { const bool xz{x == zero}, yz{y == zero}, zz{z == zero}; if (xz) { if (yz) return zz ? zero : z; else if (zz)return y; } else if (yz && zz) return x; } } if (x > z) std::swap(x, z); if (y > z) std::swap(y, z); if ((z >= sqrt_min) && (z <= sqrt_max)) [[likely]] { // no scale return std::sqrt(__builtin_assoc_barrier(x*x + y*y) + z*z); } else { const Type scale = (z >= sqrt_min) ? scale_down : scale_up; x *= scale; y *= scale; z *= scale; return std::sqrt(__builtin_assoc_barrier(x*x + y*y) + z*z); } }
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77776 --- Comment #24 from Matthias Kretz (Vir) --- (In reply to g.peterhoff from comment #23) > * How do you create the benchmarks? https://github.com/mattkretz/simd-benchmarks Look at hypot3.cpp :)
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #23 from g.peterh...@t-online.de --- Hello Matthias, you've given me new ideas. I think we agree on implementing hypot3 using a scaling factor. But the correct value is not yet implemented here either; do you have a suggestion? A version here: https://godbolt.org/z/Gd53cG9YG I've intentionally broken hypot_gp into small pieces so that you can play around with it. This is of course unnecessary for a final version. General * The function must of course work efficiently with all FP types. Questions * Sorting: It is theoretically sufficient to sort the values x,y,z only to the extent that the condition x,y <= z is fulfilled (HYPOT_SORT_FULL). * Accuracy: This is better with fma (HYPOT_FMA). * How do you create the benchmarks? I could do this myself without getting on your nerves. thx Gero
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #22 from Matthias Kretz (Vir) --- I took your hypot3_scale and reduced latency and throughput. I don't think the sqrtmax/sqrtmin limits are correct (sqrtmax² * 3 -> infinity). TYPE Latency Speedup Throughput Speedup [cycles/call] [per value] [cycles/call] [per value] float,46.5 1 12.7 1 float, hypot3_scale 35.51.31 27 0.47 float, hypot3_mkretz 30.21.54 12 1.06 -- TYPE Latency Speedup Throughput Speedup [cycles/call] [per value] [cycles/call] [per value] double,59.6 1 60 1 double, hypot3_scale 51.21.16 48.8 1.23 double, hypot3_mkretz 40.11.49 35 1.71 template constexpr T hypot(T x, T y, T z) { using limits = std::numeric_limits; auto prev_power2 = [](const T value) constexpr noexcept -> T { return std::exp2(std::floor(std::log2(value))); }; constexpr T sqrtmax = std::sqrt(limits::max()); constexpr T sqrtmin = std::sqrt(limits::min()); constexpr T scale_up = prev_power2(sqrtmax); constexpr T scale_down = T(1) / scale_up; constexpr T zero = 0; if (not (std::isnormal(x) && std::isnormal(y) && std::isnormal(z))) [[unlikely]] { if (std::isinf(x) | std::isinf(y) | std::isinf(z)) return limits::infinity(); else if (std::isnan(x) | std::isnan(y) | std::isnan(z)) return limits::quiet_NaN(); const bool xz = x == zero; const bool yz = y == zero; const bool zz = z == zero; if (xz) { if (yz) return zz ? zero : z; else if (zz) return y; } else if (yz && zz) return x; } x = std::abs(x); y = std::abs(y); z = std::abs(z); T a = std::max(std::max(x, y), z); T b = std::min(std::max(x, y), z); T c = std::min(x, y); if (a >= sqrtmin && a <= sqrtmax) [[likely]] return std::sqrt(__builtin_assoc_barrier(c * c + b * b) + a * a); const T scale = a >= sqrtmin ? scale_down : scale_up; a *= scale; b *= scale; c *= scale; return std::sqrt(__builtin_assoc_barrier(c * c + b * b) + a * a) / scale; }
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #21 from Jonathan Wakely --- (In reply to g.peterhoff from comment #19) > * You were probably wondering why I wrote "if (std::isinf(x) | std::isinf(y) > | std::isinf(z))", for example. This is intentional. The problem is that gcc > almost always produces branch code for logical operations, so *a lot* of > conditional jumps. By using arithmetic operations, so instead of || && just > | &, I can get it to generate only actually necessary conditional jumps or > cmoves. branchfree code is always better. In general the code is not equivalent if any of the subexpressions has side effects, such as setting errno, raising an exception for a signalling nan etc. but I don't think we care about that here.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #20 from Matthias Kretz (Vir) --- Thanks, I'd be very happy if such a relatively clear implementation could make it! > branchfree code is always better. Don't say it like that. Smart branching, making use of how static branch-prediction works, can speed up code significantly. You don't want to compute everything when 99.9% of the inputs need only a fraction of the work. TYPE Latency Speedup Throughput Speedup [cycles/call] [per value] [cycles/call] [per value] float, simd_abi::scalar 48.1 1 17 1 float, std::hypot 43.31.11 12.3 1.39 float, hypot3_scale 31.71.52 22.3 0.764 float, hypot3_exp 83.9 0.574 84.5 0.201 -- TYPE Latency Speedup Throughput Speedup [cycles/call] [per value] [cycles/call] [per value] double, simd_abi::scalar 54.7 1 15 1 double, std::hypot 53.81.02 19 0.79 double, hypot3_scale 441.24 24 0.625 double, hypot3_exp 91.3 0.599 91 0.165 and with -ffast-math: TYPE Latency Speedup Throughput Speedup [cycles/call] [per value] [cycles/call] [per value] float, simd_abi::scalar 48.9 1 9.15 1 float, std::hypot 53.2 0.918 8.31 1.1 float, hypot3_scale 31.31.56 14 0.652 float, hypot3_exp 55.9 0.874 21.5 0.425 -- TYPE Latency Speedup Throughput Speedup [cycles/call] [per value] [cycles/call] [per value] double, simd_abi::scalar 54.8 1 9.07 1 double, std::hypot 61.5 0.891 11.3 0.805 double, hypot3_scale 40.81.34 12.1 0.753 double, hypot3_exp 64.2 0.853 23.3 0.39 I have not tested correctness or precision yet. Also, the benchmark only uses inputs that do not require anything else than √x²+y²+z² (which, I believe, should be the common input and thus optimized for).
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #19 from g.peterh...@t-online.de --- > So, no need to use frexp/ldexp, just comparisons of hi above against sqrt of > (max finite / 3), in that case scale by multiplying all 3 args by some > appropriate scale constant, and similarly otherwise if lo1 is too small by > some large scale. I don't really know. With frexp/ldexp you probably get the highest accuracy (even if it is probably slower) instead of doing it manually. The problem is to determine suitable scaling factors and to adjust the (return)values accordingly. I have implemented both cases. Error * In the case (x==y && y==z), x*std::sqrt(T(3)) must not simply be returned, as this can lead to an overflow (inf). Generally * Instead of using fmin/fmax to determine the values hi,lo1,lo0, it is better to sort x,y,z. This is faster and clearer and no additional variables need to be introduced. * It also makes sense to consider the case (x==0 && y==0 && z==0). Optimizations * You were probably wondering why I wrote "if (std::isinf(x) | std::isinf(y) | std::isinf(z))", for example. This is intentional. The problem is that gcc almost always produces branch code for logical operations, so *a lot* of conditional jumps. By using arithmetic operations, so instead of || && just | &, I can get it to generate only actually necessary conditional jumps or cmoves. branchfree code is always better. 
template constexpr T hypot3_exp(T x, T y, T z) noexcept { using limits = std::numeric_limits; constexpr T zero = 0; x = std::abs(x); y = std::abs(y); z = std::abs(z); if (std::isinf(x) | std::isinf(y) | std::isinf(z)) [[unlikely]] return limits::infinity(); if (std::isnan(x) | std::isnan(y) | std::isnan(z)) [[unlikely]] return limits::quiet_NaN(); if ((x==zero) & (y==zero) & (z==zero)) [[unlikely]] return zero; if ((y==zero) & (z==zero)) [[unlikely]] return x; if ((x==zero) & (z==zero)) [[unlikely]] return y; if ((x==zero) & (y==zero)) [[unlikely]] return z; auto sort = [](T& a, T& b, T& c)constexpr noexcept -> void { if (a > b) std::swap(a, b); if (b > c) std::swap(b, c); if (a > b) std::swap(a, b); }; sort(x, y, z); // x <= y <= z int exp = 0; z = std::frexp(z, &exp); y = std::ldexp(y, -exp); x = std::ldexp(x, -exp); T sum = x*x + y*y; sum += z*z; return std::ldexp(std::sqrt(sum), exp); } template constexpr T hypot3_scale(T x, T y, T z) noexcept { using limits = std::numeric_limits; auto prev_power2 = [](const T value)constexpr noexcept -> T { return std::exp2(std::floor(std::log2(value))); }; constexpr T sqrtmax = std::sqrt(limits::max()), scale_up= prev_power2(sqrtmax), scale_down = T(1) / scale_up, zero= 0; x = std::abs(x); y = std::abs(y); z = std::abs(z); if (std::isinf(x) | std::isinf(y) | std::isinf(z)) [[unlikely]] return limits::infinity(); if (std::isnan(x) | std::isnan(y) | std::isnan(z)) [[unlikely]] return limits::quiet_NaN(); if ((x==zero) & (y==zero) & (z==zero)) [[unlikely]] return zero; if ((y==zero) & (z==zero)) [[unlikely]] return x; if ((x==zero) & (z==zero)) [[unlikely]] return y; if ((x==zero) & (y==zero)) [[unlikely]] return z; auto sort = [](T& a, T& b, T& c)constexpr noexcept -> void { if (a > b) std::swap(a, b); if (b > c) std::swap(b, c); if (a > b) std::swap(a, b); }; sort(x, y, z); // x <= y <= z const T scale = (z > sqrtmax) ? scale_down : (z < 1) ? 
scale_up : 1; x *= scale; y *= scale; z *= scale; T sum = x*x + y*y; sum += z*z; return std::sqrt(sum) / scale; } regards Gero
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77776 --- Comment #18 from Jakub Jelinek --- I was looking at the sysdeps/ieee754/ldbl-128/ version, i.e. what is used for hypotf128.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #17 from Matthias Kretz (Vir) --- hypotf(a, b) is implemented using double precision and hypot(a, b) uses 80-bit long double on i386 and x86_64 hypot does what you describe, right? std::experimental::simd benchmarks of hypot(a, b), where simd_abi::scalar uses the implementation (i.e. glibc): -march=skylake-avx512 -ffast-math -O3 -lmvec: TYPE Latency Speedup Throughput Speedup [cycles/call] [per value] [cycles/call] [per value] float, simd_abi::scalar 37.5 1 11.5 1 float,37.6 0.999 10.2 1.13 float, simd_abi::__sse 344.42 6.46 7.15 float, simd_abi::__avx34.18.79 6.56 14.1 float, simd_abi::_Avx512<32> 34.38.76 6.01 15.4 float, simd_abi::_Avx512<64> 44.113.6 12 15.4 float, [[gnu::vector_size(16)]] 58.32.57 47.5 0.974 float, [[gnu::vector_size(32)]]1322.27104 0.892 float, [[gnu::vector_size(64)]]240 2.5222 0.832 -- TYPE Latency Speedup Throughput Speedup [cycles/call] [per value] [cycles/call] [per value] double, simd_abi::scalar 81 1 21.5 1 double,80.11.01 21.3 1.01 double, simd_abi::__sse39.94.06 6.47 6.64 double, simd_abi::__avx40.28.05 12 7.14 double, simd_abi::_Avx512<32> 40.38.04 12 7.14 double, simd_abi::_Avx512<64> 56.211.5 24 7.14 double, [[gnu::vector_size(16)]] 89.31.81 42.5 1.01 double, [[gnu::vector_size(32)]]1502.16110 0.777 double, [[gnu::vector_size(64)]]2972.18242 0.71 -- -march=skylake-avx512 -O3 -lmvec: TYPE Latency Speedup Throughput Speedup [cycles/call] [per value] [cycles/call] [per value] float, simd_abi::scalar 37.6 1 10.4 1 float,37.7 0.998 10.2 1.02 float, simd_abi::__sse37.6 4 8.83 4.71 float, simd_abi::__avx37.58.01 9.42 8.82 float, simd_abi::_Avx512<64> 47.812.6 12 13.8 float, [[gnu::vector_size(16)]] 98.71.52 57.2 0.727 float, [[gnu::vector_size(32)]]151 2114 0.728 float, [[gnu::vector_size(64)]]2602.31230 0.722 -- TYPE Latency Speedup Throughput Speedup [cycles/call] [per value] [cycles/call] [per value] double, simd_abi::scalar 79.7 1 21.7 1 double,80.1 0.995 21.6 1 double, 
simd_abi::__sse44.2 3.6 9.99 4.33 double, simd_abi::__avx43.67.32 12 7.21 double, simd_abi::_Avx512<64> 59.910.6 24 7.21 double, [[gnu::vector_size(16)]] 88.3 1.8 44.2 0.98 double, [[gnu::vector_size(32)]]1631.96115 0.75 double, [[gnu::vector_size(64)]]3022.11233 0.742 -- I have never ported my SIMD implementation back to scalar and benchmarked it against glibc.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 Jakub Jelinek changed: What|Removed |Added CC||jakub at gcc dot gnu.org --- Comment #16 from Jakub Jelinek --- (In reply to Matthias Kretz (Vir) from comment #15) > Your implementation still needs to solve: > > 1. Loss of precision because of division & subsequent scaling by max. Users > comparing std::hypot(x, y, z) against a simple std::sqrt(x * x + y * y + z * > z) might wonder why they want to use std::hypot if it's less precise. > > 2. Relatively high cost (in latency and throughput) because of the three > divisions. You could replace it with scale = 1/max; x *= scale; ... But that > can reduce precision even further. > > 3. Summation of the x, y, and z squares isn't associative if you care about > precision. A high quality implementation needs to add the two lowest values > first. > > Here's a precise but inefficient implementation: > (https://compiler-explorer.com/z/ocGPnsYE3) > > template > [[gnu::optimize("-fno-unsafe-math-optimizations")]] > T > hypot3(T x, T y, T z) > { > x = std::abs(x); > y = std::abs(y); > z = std::abs(z); > if (std::isinf(x) || std::isinf(y) || std::isinf(z)) > return std::__infinity_v; > else if (std::isnan(x) || std::isnan(y) || std::isnan(z)) > return std::__quiet_NaN_v; > else if (x == y && y == z) > return x * std::sqrt(T(3)); > else if (z == 0 && y == 0) > return x; > else if (x == 0 && z == 0) > return y; > else if (x == 0 && y == 0) > return z; > else > { > T hi = std::max(std::max(x, y), z); > T lo0 = std::min(std::max(x, y), z); > T lo1 = std::min(x, y); > int e = 0; > hi = std::frexp(hi, &e); > lo0 = std::ldexp(lo0, -e); > lo1 = std::ldexp(lo1, -e); > T lo = lo0 * lo0 + lo1 * lo1; > return std::ldexp(std::sqrt(hi * hi + lo), e); > } > } > > AFAIK > https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=libstdc%2B%2B-v3/include/ > experimental/bits/simd_math.h;h=06e7b4496f9917f886f66fbd7629700dd17e55f9; > hb=HEAD#l1168 is a precise and efficient implementation. 
It also avoids > division altogether unless an input is subnormal. What glibc does there for the 2 argument hypot is after handling the non-finite cases finds the minimum and maximum and uses just normal multiplication, addition + sqrt for the common case where maximum isn't too large and minimum isn't too small. So, no need to use frexp/ldexp, just comparisons of hi above against sqrt of (max finite / 3), in that case scale by multiplying all 3 args by some appropriate scale constant, and similarly otherwise if lo1 is too small by some large scale.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #15 from Matthias Kretz (Vir) --- Your implementation still needs to solve: 1. Loss of precision because of division & subsequent scaling by max. Users comparing std::hypot(x, y, z) against a simple std::sqrt(x * x + y * y + z * z) might wonder why they want to use std::hypot if it's less precise. 2. Relatively high cost (in latency and throughput) because of the three divisions. You could replace it with scale = 1/max; x *= scale; ... But that can reduce precision even further. 3. Summation of the x, y, and z squares isn't associative if you care about precision. A high quality implementation needs to add the two lowest values first. Here's a precise but inefficient implementation: (https://compiler-explorer.com/z/ocGPnsYE3) template [[gnu::optimize("-fno-unsafe-math-optimizations")]] T hypot3(T x, T y, T z) { x = std::abs(x); y = std::abs(y); z = std::abs(z); if (std::isinf(x) || std::isinf(y) || std::isinf(z)) return std::__infinity_v; else if (std::isnan(x) || std::isnan(y) || std::isnan(z)) return std::__quiet_NaN_v; else if (x == y && y == z) return x * std::sqrt(T(3)); else if (z == 0 && y == 0) return x; else if (x == 0 && z == 0) return y; else if (x == 0 && y == 0) return z; else { T hi = std::max(std::max(x, y), z); T lo0 = std::min(std::max(x, y), z); T lo1 = std::min(x, y); int e = 0; hi = std::frexp(hi, &e); lo0 = std::ldexp(lo0, -e); lo1 = std::ldexp(lo1, -e); T lo = lo0 * lo0 + lo1 * lo1; return std::ldexp(std::sqrt(hi * hi + lo), e); } } AFAIK https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=libstdc%2B%2B-v3/include/experimental/bits/simd_math.h;h=06e7b4496f9917f886f66fbd7629700dd17e55f9;hb=HEAD#l1168 is a precise and efficient implementation. It also avoids division altogether unless an input is subnormal.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 Jiang An changed: What|Removed |Added CC||de34 at live dot cn --- Comment #14 from Jiang An --- (In reply to g.peterhoff from comment #11) > constexpr > * The hypot3 functions can thus be defined as _GLIBCXX_CONSTEXPR. I think it should be marked _GLIBCXX26_CONSTEXPR. (and we may also need to change bits/c++config)
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #13 from g.peterh...@t-online.de --- Thanks for the suggestions: template constexpr _Tp __hypot3(_Tp __x, _Tp __y, _Tp __z) noexcept { if (std::isinf(__x) | std::isinf(__y) | std::isinf(__z)) [[__unlikely__]] return _Tp(INFINITY); __x = std::fabs(__x); __y = std::fabs(__y); __z = std::fabs(__z); const _Tp __max = std::fmax(std::fmax(__x, __y), __z); if (__max == _Tp{}) [[__unlikely__]] return __max; __x /= __max; __y /= __max; __z /= __max; return std::sqrt(__x*__x + __y*__y + __z*__z) * __max; } The functions are then set to constexpr/noexcept. regards Gero
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #12 from Jonathan Wakely --- (In reply to g.peterhoff from comment #11) > Would this be a good implementation for hypot3 in cmath? Thanks for the suggestion! > #define GCC_UNLIKELY(x) __builtin_expect(x, 0) > #define GCC_LIKELY(x) __builtin_expect(x, 1) > > namespace __detail > { > template > inline _GLIBCXX_CONSTEXPR typename > enable_if::value, > bool>::type __isinf3(const _Tp __x, const _Tp __y, const _Tp __z) noexcept > { > return bool(int(std::isinf(__x)) | int(std::isinf(__y)) | > int(std::isinf(__z))); The casts are redundant and just make it harder to read IMHO: return std::isinf(__x) | std::isinf(__y) | std::isinf(__z); > } > > template > inline _GLIBCXX_CONSTEXPR typename > enable_if::value, > _Tp>::type__hypot3(_Tp __x, _Tp __y, _Tp __z) noexcept > { > __x = std::fabs(__x); > __y = std::fabs(__y); > __z = std::fabs(__z); > > const _Tp > __max = std::fmax(std::fmax(__x, __y), __z); > > if (GCC_UNLIKELY(__max == _Tp{})) > { > return __max; > } > else > { > __x /= __max; > __y /= __max; > __z /= __max; > return std::sqrt(__x*__x + __y*__y + __z*__z) * __max; > } > } > } // __detail > > > template > inline _GLIBCXX_CONSTEXPR typename > enable_if::value, > _Tp>::type__hypot3(const _Tp __x, const _Tp __y, const _Tp __z) noexcept This is a C++17 function, you can use enable_if_t>, but see below. > { > return (GCC_UNLIKELY(__detail::__isinf3(__x, __y, __z))) ? > numeric_limits<_Tp>::infinity() : __detail::__hypot3(__x, __y, __z); > } > > #undef GCC_UNLIKELY > #undef GCC_LIKELY > > How does it work? > * Basically, I first pull out the special case INFINITY (see > https://en.cppreference.com/w/cpp/numeric/math/hypot). > * As an additional safety measure (to prevent misuse) the functions are > defined by enable_if. I don't think we want to slow down compilation like that. If users decide to misuse std::__detail::__isinf3 then they get what they deserve. 
> constexpr > * The hypot3 functions can thus be defined as _GLIBCXX_CONSTEXPR. Just use 'constexpr' because this function isn't compiled as C++98. Then you don't need the 'inline'. Although the standard doesn't allow std::hypot3 to be constexpr. > Questions > * To get a better runtime behavior I define GCC_(UN)LIKELY. Are there > already such macros (which I have overlooked)? No, but you can do: if (__isinf3(__x, __y, __x)) [[__unlikely__]] ... > * The functions are noexcept. Does that make sense? If yes: why are the math > functions not noexcept? I think it's just because nobody bothered to add them, and I doubt much code ever needs to check whether they are noexcept. The compiler should already know the standard libm functions don't throw. For this function (which isn't in libm and the compiler doesn't know about) it seems worth adding 'noexcept'. Does splitting it into three functions matter? It seems simpler as a single function: template constexpr _Tp __hypot3(_Tp __x, _Tp __y, _Tp __z) noexcept { if (std::isinf(__x) | std::isinf(__y) | std::isinf(__z)) [[__unlikely__]] return numeric_limits<_Tp>::infinity(); __x = std::fabs(__x); __y = std::fabs(__y); __z = std::fabs(__z); const _Tp __max = std::fmax(std::fmax(__x, __y), __z); if (__max == _Tp{}) [[__unlikely__]] return __max; else { __x /= __max; __y /= __max; __z /= __max; return std::sqrt(__x*__x + __y*__y + __z*__z) * __max; } } This would add a dependency on to , which isn't currently there. Maybe we could just return (_Tp)__builtin_huge_vall().
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 g.peterh...@t-online.de changed: What|Removed |Added CC||g.peterh...@t-online.de --- Comment #11 from g.peterh...@t-online.de --- Would this be a good implementation for hypot3 in cmath? #define GCC_UNLIKELY(x) __builtin_expect(x, 0) #define GCC_LIKELY(x) __builtin_expect(x, 1) namespace __detail { template inline _GLIBCXX_CONSTEXPR typename enable_if::value, bool>::type __isinf3(const _Tp __x, const _Tp __y, const _Tp __z) noexcept { return bool(int(std::isinf(__x)) | int(std::isinf(__y)) | int(std::isinf(__z))); } template inline _GLIBCXX_CONSTEXPR typename enable_if::value, _Tp>::type __hypot3(_Tp __x, _Tp __y, _Tp __z) noexcept { __x = std::fabs(__x); __y = std::fabs(__y); __z = std::fabs(__z); const _Tp __max = std::fmax(std::fmax(__x, __y), __z); if (GCC_UNLIKELY(__max == _Tp{})) { return __max; } else { __x /= __max; __y /= __max; __z /= __max; return std::sqrt(__x*__x + __y*__y + __z*__z) * __max; } } } // __detail template inline _GLIBCXX_CONSTEXPR typename enable_if::value, _Tp>::type __hypot3(const _Tp __x, const _Tp __y, const _Tp __z) noexcept { return (GCC_UNLIKELY(__detail::__isinf3(__x, __y, __z))) ? numeric_limits<_Tp>::infinity() : __detail::__hypot3(__x, __y, __z); } #undef GCC_UNLIKELY #undef GCC_LIKELY How does it work? * Basically, I first pull out the special case INFINITY (see https://en.cppreference.com/w/cpp/numeric/math/hypot). * As an additional safety measure (to prevent misuse) the functions are defined by enable_if. constexpr * The hypot3 functions can thus be defined as _GLIBCXX_CONSTEXPR. Questions * To get a better runtime behavior I define GCC_(UN)LIKELY. Are there already such macros (which I have overlooked)? * The functions are noexcept. Does that make sense? If yes: why are the math functions not noexcept? thx Gero
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77776 Richard Biener changed: What|Removed |Added Status|NEW |ASSIGNED
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #10 from Matthias Kretz --- Experience from testing my simd implementation: I had failures (2 ULP deviation from long double result) when using auto __xx = abs(__x); auto __yy = abs(__y); auto __zz = abs(__z); auto __hi = max(max(__xx, __yy), __zz); auto __l0 = min(__zz, max(__xx, __yy)); auto __l1 = min(__yy, __xx); __l0 /= __hi; __l1 /= __hi; auto __lo = __l0 * __l0 + __l1 * __l1; return __hi * sqrt(1 + __lo); Where the failures occur depends on wether FMA instructions are used. I have observed only 1 ULP deviation from long double with my algorithm (independent of FMAs). Here are two data points that seem challenging: hypot(0x1.965372p+125f, 0x1.795c92p+126f, 0x1.d0fc96p+125f) -> 0x1.e79366p+126f hypot(0x1.235f24p+125f, 0x1.5b88f4p+125f, 0x1.d57828p+124f) -> 0x1.feaa26p+125f
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #9 from Matthias Kretz --- (In reply to emsr from comment #7) > What does this do? > > auto __hi_exp = > __hi & simd<_T, _Abi>(std::numeric_limits<_T>::infinity()); // no error component-wise bitwise and of __hi and +inf. Or in other words, it sets all sign bits and mantissa bits to 0. Consequently `__hi / __hi_exp` returns __hi with the exponent bits set to 0x3f8 (float) / 0x3ff (double) and the mantissa bits unchanged. > Sorry, I have no simd knowlege yet. It's a very simple idea: - simd holds simd::size() many values of type T. - All operators/operations act component-wise. See Section 9 in wg21.link/N4793 for the last WD of the TS. > Anyway, doesn't the large scale risk overflow if a, b are large? I guess I'm > lost. It basically has the same underflow risks as your implementation does, and no risk of overflow.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #8 from emsr at gcc dot gnu.org --- (In reply to Matthias Kretz from comment #6) > > How precise is hypot supposed to be? I know it is supposed to try and avoid > > spurious overflow/underflow, but I am not convinced that it should aim for > > correct rounding. > > That's a good question for all of / . Any normative wording > on that question would be (welcome) news to me. AFAIK precision is left > completely as QoI. So, except for the Annex F requirements (which we can > drop with -ffast-math), let's implement all of as `return 0;`. ;-) > It looks like C is trying to incorporate ISO/IEC TS 18661-1 Floating-point extensions for C — Part 1: Binary floating-point arithmetic, etc. This adds a lot of rounding control but it looks like correcly rounded transcendentals are still merely a recommendation.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77776 --- Comment #7 from emsr at gcc dot gnu.org --- What does this do? auto __hi_exp = __hi & simd<_T, _Abi>(std::numeric_limits<_T>::infinity()); // no error Sorry, I have no simd knowledge yet. Anyway, doesn't the large scale risk overflow if a, b are large? I guess I'm lost.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #6 from Matthias Kretz --- (In reply to Marc Glisse from comment #4) > Your "reference" number seems strange. Why not do the computation with > double (or long double or mpfr) or use __builtin_hypotf? Note that it > changes the value. Doh. (I didn't know the builtin exists. But use of (long) double should have been a no-brainer.) I guess my point was the precision of the input to sqrt not the result of sqrt. The sqrt makes that error almost irrelevant, though. My numerical analysis skills are not good enough to argue for what approach is better. But intuitively, keeping the information of the `amax` mantissa for the final multiplication around might actually make that approach slightly better (if the input to the sqrt were precise that wouldn't be true, though - but it never is). > How precise is hypot supposed to be? I know it is supposed to try and avoid > spurious overflow/underflow, but I am not convinced that it should aim for > correct rounding. That's a good question for all of / . Any normative wording on that question would be (welcome) news to me. AFAIK precision is left completely as QoI. So, except for the Annex F requirements (which we can drop with -ffast-math), let's implement all of as `return 0;`. ;-) > (I see that you are using clang in that godbolt link, with gcc I need to > mark the global variables with "extern const" to get a similar asm) Thanks for the hint. I switched to clang when GCC started to produce code instead of constants in the asm. (I also like the unicode identifier support in clang ;-))
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77776 --- Comment #5 from emsr at gcc dot gnu.org --- Right. fma is irrelevant. I will wind up with sqrt(1 + __lo). I won't hope that max * __scale == 1 here but just add 1. And why waste the partial sort? New patch tomorrow a.m. (I guess I'm too late though.)
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 --- Comment #4 from Marc Glisse --- (In reply to Matthias Kretz from comment #3) > Did you consider the error introduced by scaling with __amax? I made sure > that the division is without error by zeroing the mantissa bits. Here's a > motivating example that shows an error of 1 ulp otherwise: > https://godbolt.org/z/_U2K7e Your "reference" number seems strange. Why not do the computation with double (or long double or mpfr) or use __builtin_hypotf? Note that it changes the value. How precise is hypot supposed to be? I know it is supposed to try and avoid spurious overflow/underflow, but I am not convinced that it should aim for correct rounding. (I see that you are using clang in that godbolt link, with gcc I need to mark the global variables with "extern const" to get a similar asm) > About std::fma, how bad is the performance hit if there's no instruction for > it? FMA doesn't seem particularly relevant here.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77776 --- Comment #3 from Matthias Kretz --- Did you consider the error introduced by scaling with __amax? I made sure that the division is without error by zeroing the mantissa bits. Here's a motivating example that shows an error of 1 ulp otherwise: https://godbolt.org/z/_U2K7e About std::fma, how bad is the performance hit if there's no instruction for it?
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77776 --- Comment #2 from emsr at gcc dot gnu.org --- I have this in another tree which solves the inf issue: namespace __detail { // Avoid including all of <cmath> template<typename _Tp> constexpr _Tp __fmax3(_Tp __x, _Tp __y, _Tp __z) { return std::fmax(std::fmax(__x, __y), std::fmax(__y, __z)); } template<typename _Tp> constexpr _Tp __hypot3(_Tp __x, _Tp __y, _Tp __z) { if (std::isnan(__x) || std::isnan(__y) || std::isnan(__z)) return std::numeric_limits<_Tp>::quiet_NaN(); else { __x = std::abs(__x); __y = std::abs(__y); __z = std::abs(__z); const auto __amax = __fmax3(__x, __y, __z); if (__amax == _Tp{0}) return _Tp{0}; else if (std::__detail::__isinf(__amax)) return std::numeric_limits<_Tp>::infinity(); else { __x /= __amax; __y /= __amax; __z /= __amax; return __amax * std::sqrt(__x * __x + __y * __y + __z * __z); } } } } I get your point about scaling and adding smallest first. I have access to std::fma(). For me I think this means: /* const auto __amax = max(max(__x, __y), __z); const auto __l0 = min(__z, max(__x, __y)); const auto __l1 = min(__y, __x); const auto __scale = 1 / __amax; __l0 *= __scale; __l1 *= __scale; // Add the two smaller values first. const auto __lo = __l0 * __l0 + __l1 * __l1; const auto __r = __amax * sqrt(fma(__h1, __h1, __lo)); */
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6 Matthias Kretz changed: What|Removed |Added CC||kretz at kde dot org --- Comment #1 from Matthias Kretz --- Testcase for the inf input case (https://wandbox.org/permlink/wDla0jQOtKFUsHNs): constexpr float inf = std::numeric_limits::infinity(); std::cout << std::hypot(inf, 1.f, 1.f) << '\n'; std::cout << std::hypot(inf, inf, 1.f) << '\n'; std::cout << std::hypot(inf, inf, inf) << '\n'; All of these return NaN, but surely should return inf instead. Joseph mentions this issue in his mails. here's what I currently have for std::experimental::simd's hypot overload: template simd<_T, _Abi> hypot(simd<_T, _Abi> __x, simd<_T, _Abi> __y, simd<_T, _Abi> __z) { using namespace __proposed::float_bitwise_operators; __x = abs(__x);// no error __y = abs(__y);// no error __z = abs(__z);// no error auto __hi = max(max(__x, __y), __z); // no error auto __l0 = min(__z, max(__x, __y)); // no error auto __l1 = min(__y, __x); // no error auto __hi_exp = __hi & simd<_T, _Abi>(std::numeric_limits<_T>::infinity()); // no error where(__hi_exp == 0, __hi_exp) = std::numeric_limits<_T>::min(); // scale __hi to 0x1.???p0 and __lo by the same factor auto __scale = 1 / __hi_exp; // no error auto __h1 = __hi / __hi_exp; // no error __l0 *= 1 / __hi_exp; // no error __l1 *= 1 / __hi_exp; // no error auto __lo = __l0 * __l0 + __l1 * __l1; // add the two smaller values first auto __r = __hi_exp * sqrt(__lo + __h1 * __h1); where(__l0 + __l1 == 0, __r) = __hi; where(isinf(__x) || isinf(__y) || isinf(__z), __r) = std::numeric_limits<_T>::infinity(); return __r; } IIUC this might still produce underflow exceptions, even though the answer is precise. And without FMA in the accumulation inside the sqrt there may be a larger than .5 ULP error on the result. But that's just to the best of my knowledge.
[Bug libstdc++/77776] C++17 std::hypot implementation is poor
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77776 Jonathan Wakely changed: What|Removed |Added Status|UNCONFIRMED |NEW Last reconfirmed||2017-12-12 Ever confirmed|0 |1