[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-04-10 Thread g.peterhoff--- via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #29 from g.peterh...@t-online.de ---
(In reply to Jakub Jelinek from comment #28)
> As long as the scale is a power of two or 1.0 / power of two, I don't see
> why any version wouldn't be inaccurate.

Yes, but the constant scale_up is incorrectly selected.
scale_up = std::exp2(Type(limits::max_exponent-1)) --> ok
scale_up = std::exp2(Type(limits::max_exponent/2)) --> error
scale_up = prev_power2(sqrt_max) --> error

scale_down = std::exp2(Type(limits::min_exponent-1))
also seems to me to be more favorable.

PS:
There seems to be a problem with random numbers and std::float16_t, which is
why I use std::uniform_real_distribution. I have not yet found
out exactly where the error lies.

thx
Gero


template 
inline constexpr Type   hypot_exp(Type x, Type y, Type z)   noexcept
{
using limits = std::numeric_limits;

constexpr Type
zero = 0;

x = std::abs(x);
y = std::abs(y);
z = std::abs(z);

if (!(std::isnormal(x) && std::isnormal(y) && std::isnormal(z)))
[[unlikely]]
{
if  (std::isinf(x) | std::isinf(y) | std::isinf(z))
return limits::infinity();
else if (std::isnan(x) | std::isnan(y) | std::isnan(z)) return
limits::quiet_NaN();
else
{
const bool
xz{x == zero},
yz{y == zero},
zz{z == zero};

if (xz)
{
if (yz) return zz ? zero : z;
else if (zz)return y;
}
else if (yz && zz)  return x;
}
}

if (x > z) std::swap(x, z);
if (y > z) std::swap(y, z);

int
exp;

z = std::frexp(z, &exp);
y = std::ldexp(y, -exp);
x = std::ldexp(x, -exp);
return std::ldexp(std::sqrt(__builtin_assoc_barrier(x*x + y*y) + z*z),
exp);
}

template 
inline constexpr Type   hypot_gp(Type x, Type y, Type z)noexcept
{
using limits = std::numeric_limits;

constexpr Type
sqrt_min= std::sqrt(limits::min()),
sqrt_max= std::sqrt(limits::max()),
scale_up= std::exp2(Type(limits::max_exponent-1)),
scale_down  = std::exp2(Type(limits::min_exponent-1)),
zero= 0;

x = std::abs(x);
y = std::abs(y);
z = std::abs(z);

if (!(std::isnormal(x) && std::isnormal(y) && std::isnormal(z)))
[[unlikely]]
{
if  (std::isinf(x) | std::isinf(y) | std::isinf(z))
return limits::infinity();
else if (std::isnan(x) | std::isnan(y) | std::isnan(z)) return
limits::quiet_NaN();
else
{
const bool
xz{x == zero},
yz{y == zero},
zz{z == zero};

if (xz)
{
if (yz) return zz ? zero : z;
else if (zz)return y;
}
else if (yz && zz)  return x;
}
}

if (x > z) std::swap(x, z);
if (y > z) std::swap(y, z);

if (const bool b{z>=sqrt_min}; b && z<=sqrt_max) [[likely]]
{
//  no scale
return std::sqrt(__builtin_assoc_barrier(x*x + y*y) + z*z);
}
else
{
const Type
scale = b ? scale_down : scale_up;

x *= scale;
y *= scale;
z *= scale;
return std::sqrt(__builtin_assoc_barrier(x*x + y*y) + z*z) /
scale;
}
}

template 
voidtest(const size_t count, const Type min, const Type max, const Type
factor)
{
std::random_device rd{};
std::mt19937 gen{rd()};
std::uniform_real_distribution dis{min, max};

auto rnd = [&]() noexcept -> Type { return Type(dis(gen) * factor); };

for (size_t i=0; i;

test(1024*1024, 0.5, 1, limits::max());
test(1024*1024, 0, 1, limits::min());

return EXIT_SUCCESS;
}

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-04-10 Thread jakub at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #28 from Jakub Jelinek  ---
As long as the scale is a power of two or 1.0 / power of two, I don't see why
any version wouldn't be inaccurate.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-04-10 Thread g.peterhoff--- via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #27 from g.peterh...@t-online.de ---
Hi Matthias,
thanks for your benchmark. I still have 2 questions:

1) Accuracy
The frexp/ldexp variant seems to be the most accurate; is that correct? Then
other constants would have to be used in hypot_gp:
scale_up   = std::exp2(Type(limits::max_exponent-1))
scale_down = std::exp2(Type(limits::min_exponent-1))

2) Speed
Your benchmark outputs several columns (Δ)Latency/(Δ)Throughput/Speedup. What
exactly do the values stand for; what should be optimized for?

thx
Gero

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-04-04 Thread g.peterhoff--- via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #26 from g.peterh...@t-online.de ---
must of course be "... / scale".
How can I still edit posts?

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-04-04 Thread g.peterhoff--- via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #25 from g.peterh...@t-online.de ---
Hi Matthias,
to get good results on average (all FP-types: (B)FP16..FP128,
scalar/vectorized(SIMD)/parallel/...) this algorithm seems to me (so far) to be
suitable:

template 
inline constexpr Type   hypot_gp(Type x, Type y, Type z)noexcept
{
using limits = std::numeric_limits;

constexpr Type
sqrt_min= std::sqrt(limits::min()),
sqrt_max= std::sqrt(limits::max()),
scale_up= std::exp2( Type(limits::max_exponent/2)),
scale_down  = std::exp2(-Type(limits::max_exponent/2)),
zero= 0;

x = std::abs(x);
y = std::abs(y);
z = std::abs(z);

if (!(std::isnormal(x) && std::isnormal(y) && std::isnormal(z)))
[[unlikely]]
{
if (std::isinf(x) | std::isinf(y) | std::isinf(z))  return
limits::infinity();
else if (std::isnan(x) | std::isnan(y) | std::isnan(z)) return
limits::quiet_NaN();
else
{
const bool
xz{x == zero},
yz{y == zero},
zz{z == zero};

if (xz)
{
if (yz) return zz ? zero : z;
else if (zz)return y;
}
else if (yz && zz)  return x;
}
}

if (x > z) std::swap(x, z);
if (y > z) std::swap(y, z);

if ((z >= sqrt_min) && (z <= sqrt_max)) [[likely]]
{
//  no scale
return std::sqrt(__builtin_assoc_barrier(x*x + y*y) + z*z);
}
else
{
const Type
scale = (z >= sqrt_min) ? scale_down : scale_up;

x *= scale;
y *= scale;
z *= scale;
return std::sqrt(__builtin_assoc_barrier(x*x + y*y) + z*z);
}
}

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-25 Thread mkretz at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #24 from Matthias Kretz (Vir)  ---
(In reply to g.peterhoff from comment #23)
> * How do you create the benchmarks?

https://github.com/mattkretz/simd-benchmarks
Look at hypot3.cpp :)

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-18 Thread g.peterhoff--- via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #23 from g.peterh...@t-online.de ---
Hello Matthias,
you've given me new ideas. I think we agree on implementing hypot3 using a
scaling factor. But the correct value is not yet implemented here either; do
you have a suggestion?
A version here: https://godbolt.org/z/Gd53cG9YG
I've intentionally broken hypot_gp into small pieces so that you can play
around with it. This is of course unnecessary for a final version.

General
* The function must of course work efficiently with all FP types.

Questions
* Sorting: It is theoretically sufficient to sort the values x,y,z only to the
extent that the condition x,y <= z is fulfilled (HYPOT_SORT_FULL).
* Accuracy: This is better with fma (HYPOT_FMA).
* How do you create the benchmarks? I could do this myself without getting on
your nerves.

thx
Gero

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-12 Thread mkretz at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #22 from Matthias Kretz (Vir)  ---
I took your hypot3_scale and reduced latency and throughput. I don't think the
sqrtmax/sqrtmin limits are correct (sqrtmax² * 3 -> infinity).

  TYPE  Latency Speedup Throughput
Speedup 
  [cycles/call] [per value]  [cycles/call] [per
value]  
 float,46.5   1   12.7 
 1  
 float, hypot3_scale   35.51.31 27 
  0.47  
 float, hypot3_mkretz  30.21.54 12 
  1.06  
--
 
  TYPE  Latency Speedup Throughput
Speedup 
  [cycles/call] [per value]  [cycles/call] [per
value]  
double,59.6   1 60 
 1  
double, hypot3_scale   51.21.16   48.8 
  1.23  
double, hypot3_mkretz  40.11.49 35 
  1.71


template 
  constexpr T 
  hypot(T x, T y, T z)
  { 
using limits = std::numeric_limits;

auto prev_power2 = [](const T value) constexpr noexcept -> T 
{ 
  return std::exp2(std::floor(std::log2(value)));
};

constexpr T sqrtmax = std::sqrt(limits::max());
constexpr T sqrtmin = std::sqrt(limits::min());
constexpr T scale_up = prev_power2(sqrtmax);
constexpr T scale_down = T(1) / scale_up;
constexpr T zero = 0;

if (not (std::isnormal(x) && std::isnormal(y) && std::isnormal(z)))
[[unlikely]]
  { 
if (std::isinf(x) | std::isinf(y) | std::isinf(z))
  return limits::infinity();
else if (std::isnan(x) | std::isnan(y) | std::isnan(z))
  return limits::quiet_NaN();
const bool xz = x == zero;
const bool yz = y == zero;
const bool zz = z == zero;
if (xz)
  { 
if (yz)
  return zz ? zero : z;
else if (zz)
  return y;
  } 
else if (yz && zz)
  return x;
  } 

x = std::abs(x);
y = std::abs(y);
z = std::abs(z);

T a = std::max(std::max(x, y), z);
T b = std::min(std::max(x, y), z);
T c = std::min(x, y);

if (a >= sqrtmin && a <= sqrtmax) [[likely]]
  return std::sqrt(__builtin_assoc_barrier(c * c + b * b) + a * a);

const T scale = a >= sqrtmin ? scale_down : scale_up;

a *= scale;
b *= scale;
c *= scale;

return std::sqrt(__builtin_assoc_barrier(c * c + b * b) + a * a) / scale;
  }

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-06 Thread redi at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #21 from Jonathan Wakely  ---
(In reply to g.peterhoff from comment #19)
> * You were probably wondering why I wrote "if (std::isinf(x) | std::isinf(y)
> | std::isinf(z))", for example. This is intentional. The problem is that gcc
> almost always produces branch code for logical operations, so *a lot* of
> conditional jumps. By using arithmetic operations, so instead of || && just
> | &, I can get it to generate only actually necessary conditional jumps or
> cmoves. branchfree code is always better.

In general the code is not equivalent if any of the subexpressions has side
effects, such as setting errno, raising an exception for a signalling nan etc.
but I don't think we care about that here.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-06 Thread mkretz at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #20 from Matthias Kretz (Vir)  ---
Thanks, I'd be very happy if such a relatively clear implementation could make
it!

> branchfree code is always better.

Don't say it like that. Smart branching, making use of how static
branch-prediction works, can speed up code significantly. You don't want to
compute everything when 99.9% of the inputs need only a fraction of the work.

  TYPE  Latency Speedup Throughput
Speedup
  [cycles/call] [per value]  [cycles/call] [per
value]
 float, simd_abi::scalar   48.1   1 17 
 1
 float, std::hypot 43.31.11   12.3 
  1.39
 float, hypot3_scale   31.71.52   22.3 
 0.764
 float, hypot3_exp 83.9   0.574   84.5 
 0.201
--
  TYPE  Latency Speedup Throughput
Speedup
  [cycles/call] [per value]  [cycles/call] [per
value]
double, simd_abi::scalar   54.7   1 15 
 1
double, std::hypot 53.81.02 19 
  0.79
double, hypot3_scale 441.24 24 
 0.625
double, hypot3_exp 91.3   0.599 91 
 0.165

and with -ffast-math:

  TYPE  Latency Speedup Throughput
Speedup
  [cycles/call] [per value]  [cycles/call] [per
value]
 float, simd_abi::scalar   48.9   1   9.15 
 1
 float, std::hypot 53.2   0.918   8.31 
   1.1
 float, hypot3_scale   31.31.56 14 
 0.652
 float, hypot3_exp 55.9   0.874   21.5 
 0.425
--
  TYPE  Latency Speedup Throughput
Speedup
  [cycles/call] [per value]  [cycles/call] [per
value]
double, simd_abi::scalar   54.8   1   9.07 
 1
double, std::hypot 61.5   0.891   11.3 
 0.805
double, hypot3_scale   40.81.34   12.1 
 0.753
double, hypot3_exp 64.2   0.853   23.3 
  0.39


I have not tested correctness or precision yet. Also, the benchmark only uses
inputs that do not require anything else than √x²+y²+z² (which, I believe,
should be the common input and thus optimized for).

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-05 Thread g.peterhoff--- via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #19 from g.peterh...@t-online.de ---
> So, no need to use frexp/ldexp, just comparisons of hi above against sqrt of
> (max finite / 3), in that case scale by multiplying all 3 args by some
> appropriate scale constant, and similarly otherwise if lo1 is too small by
> some large scale.

I don't really know. With frexp/ldexp you probably get the highest accuracy
(even if it is probably slower) instead of doing it manually. The problem is to
determine suitable scaling factors and to adjust the (return)values
accordingly. I have implemented both cases.

Error
* In the case (x==y && y==z), x*std::sqrt(T(3)) must not simply be returned, as
this can lead to an overflow (inf).

Generally
* Instead of using fmin/fmax to determine the values hi,lo1,lo0, it is better
to sort x,y,z. This is faster and clearer and no additional variables need to
be introduced.
* It also makes sense to consider the case (x==0 && y==0 && z==0).

Optimizations
* You were probably wondering why I wrote "if (std::isinf(x) | std::isinf(y) |
std::isinf(z))", for example. This is intentional. The problem is that gcc
almost always produces branch code for logical operations, so *a lot* of
conditional jumps. By using arithmetic operations, so instead of || && just |
&, I can get it to generate only actually necessary conditional jumps or
cmoves. branchfree code is always better.


template 
constexpr T hypot3_exp(T x, T y, T z) noexcept
{
using limits = std::numeric_limits;

constexpr T
zero = 0;

x = std::abs(x);
y = std::abs(y);
z = std::abs(z);

if (std::isinf(x) | std::isinf(y) | std::isinf(z))  [[unlikely]]
return limits::infinity();
if (std::isnan(x) | std::isnan(y) | std::isnan(z))  [[unlikely]]
return limits::quiet_NaN();
if ((x==zero) & (y==zero) & (z==zero))  [[unlikely]]
return zero;
if ((y==zero) & (z==zero))  [[unlikely]]
return x;
if ((x==zero) & (z==zero))  [[unlikely]]
return y;
if ((x==zero) & (y==zero))  [[unlikely]]
return z;

auto sort = [](T& a, T& b, T& c)constexpr noexcept -> void
{
if (a > b) std::swap(a, b);
if (b > c) std::swap(b, c);
if (a > b) std::swap(a, b);
};

sort(x, y, z);  //  x <= y <= z

int
exp = 0;

z = std::frexp(z, &exp);
y = std::ldexp(y, -exp);
x = std::ldexp(x, -exp);

T
sum = x*x + y*y;

sum += z*z;
return std::ldexp(std::sqrt(sum), exp);
}

template 
constexpr T hypot3_scale(T x, T y, T z) noexcept
{
using limits = std::numeric_limits;

auto prev_power2 = [](const T value)constexpr noexcept -> T
{
return std::exp2(std::floor(std::log2(value)));
};

constexpr T
sqrtmax = std::sqrt(limits::max()),
scale_up= prev_power2(sqrtmax),
scale_down  = T(1) / scale_up,
zero= 0;

x = std::abs(x);
y = std::abs(y);
z = std::abs(z);

if (std::isinf(x) | std::isinf(y) | std::isinf(z))  [[unlikely]]
return limits::infinity();
if (std::isnan(x) | std::isnan(y) | std::isnan(z))  [[unlikely]]
return limits::quiet_NaN();
if ((x==zero) & (y==zero) & (z==zero))  [[unlikely]]
return zero;
if ((y==zero) & (z==zero))  [[unlikely]]
return x;
if ((x==zero) & (z==zero))  [[unlikely]]
return y;
if ((x==zero) & (y==zero))  [[unlikely]]
return z;

auto sort = [](T& a, T& b, T& c)constexpr noexcept -> void
{
if (a > b) std::swap(a, b);
if (b > c) std::swap(b, c);
if (a > b) std::swap(a, b);
};

sort(x, y, z);  //  x <= y <= z

const T
scale = (z > sqrtmax) ? scale_down : (z < 1) ? scale_up : 1;

x *= scale;
y *= scale;
z *= scale;

T
sum = x*x + y*y;

sum += z*z;
return std::sqrt(sum) / scale;
}


regards
Gero

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-04 Thread jakub at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #18 from Jakub Jelinek  ---
I was looking at the sysdeps/ieee754/ldbl-128/ version, i.e. what is used for
hypotf128.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-04 Thread mkretz at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #17 from Matthias Kretz (Vir)  ---
hypotf(a, b) is implemented using double precision and hypot(a, b) uses 80-bit
long double on i386 and x86_64 hypot does what you describe, right?

std::experimental::simd benchmarks of hypot(a, b), where simd_abi::scalar uses
the  implementation (i.e. glibc):


-march=skylake-avx512 -ffast-math -O3 -lmvec:
  TYPE  Latency Speedup Throughput
Speedup
  [cycles/call] [per value]  [cycles/call] [per
value]
 float, simd_abi::scalar   37.5   1   11.5 
 1
 float,37.6   0.999   10.2 
  1.13
 float, simd_abi::__sse  344.42   6.46 
  7.15
 float, simd_abi::__avx34.18.79   6.56 
  14.1
 float, simd_abi::_Avx512<32>  34.38.76   6.01 
  15.4
 float, simd_abi::_Avx512<64>  44.113.6 12 
  15.4
 float, [[gnu::vector_size(16)]]   58.32.57   47.5 
 0.974
 float, [[gnu::vector_size(32)]]1322.27104 
 0.892
 float, [[gnu::vector_size(64)]]240 2.5222 
 0.832
--
  TYPE  Latency Speedup Throughput
Speedup
  [cycles/call] [per value]  [cycles/call] [per
value]
double, simd_abi::scalar 81   1   21.5 
 1
double,80.11.01   21.3 
  1.01
double, simd_abi::__sse39.94.06   6.47 
  6.64
double, simd_abi::__avx40.28.05 12 
  7.14
double, simd_abi::_Avx512<32>  40.38.04 12 
  7.14
double, simd_abi::_Avx512<64>  56.211.5 24 
  7.14
double, [[gnu::vector_size(16)]]   89.31.81   42.5 
  1.01
double, [[gnu::vector_size(32)]]1502.16110 
 0.777
double, [[gnu::vector_size(64)]]2972.18242 
  0.71
--

-march=skylake-avx512 -O3 -lmvec:
  TYPE  Latency Speedup Throughput
Speedup
  [cycles/call] [per value]  [cycles/call] [per
value]
 float, simd_abi::scalar   37.6   1   10.4 
 1
 float,37.7   0.998   10.2 
  1.02  
 float, simd_abi::__sse37.6   4   8.83 
  4.71  
 float, simd_abi::__avx37.58.01   9.42 
  8.82
 float, simd_abi::_Avx512<64>  47.812.6 12 
  13.8
 float, [[gnu::vector_size(16)]]   98.71.52   57.2 
 0.727
 float, [[gnu::vector_size(32)]]151   2114 
 0.728
 float, [[gnu::vector_size(64)]]2602.31230 
 0.722
--
  TYPE  Latency Speedup Throughput
Speedup
  [cycles/call] [per value]  [cycles/call] [per
value]
double, simd_abi::scalar   79.7   1   21.7 
 1
double,80.1   0.995   21.6 
 1
double, simd_abi::__sse44.2 3.6   9.99 
  4.33
double, simd_abi::__avx43.67.32 12 
  7.21
double, simd_abi::_Avx512<64>  59.910.6 24 
  7.21
double, [[gnu::vector_size(16)]]   88.3 1.8   44.2 
  0.98
double, [[gnu::vector_size(32)]]1631.96115 
  0.75
double, [[gnu::vector_size(64)]]3022.11233 
 0.742
--

I have never ported my SIMD implementation back to scalar and benchmarked it
against glibc.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-04 Thread jakub at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

Jakub Jelinek  changed:

   What|Removed |Added

 CC||jakub at gcc dot gnu.org

--- Comment #16 from Jakub Jelinek  ---
(In reply to Matthias Kretz (Vir) from comment #15)
> Your implementation still needs to solve:
> 
> 1. Loss of precision because of division & subsequent scaling by max. Users
> comparing std::hypot(x, y, z) against a simple std::sqrt(x * x + y * y + z *
> z) might wonder why they want to use std::hypot if it's less precise.
> 
> 2. Relatively high cost (in latency and throughput) because of the three
> divisions. You could replace it with scale = 1/max; x *= scale; ... But that
> can reduce precision even further.
> 
> 3. Summation of the x, y, and z squares isn't associative if you care about
> precision. A high quality implementation needs to add the two lowest values
> first.
> 
> Here's a precise but inefficient implementation:
> (https://compiler-explorer.com/z/ocGPnsYE3)
> 
> template 
> [[gnu::optimize("-fno-unsafe-math-optimizations")]]
> T
> hypot3(T x, T y, T z)
> {
>   x = std::abs(x);
>   y = std::abs(y);
>   z = std::abs(z);
>   if (std::isinf(x) || std::isinf(y) || std::isinf(z))
> return std::__infinity_v;
>   else if (std::isnan(x) || std::isnan(y) || std::isnan(z))
> return std::__quiet_NaN_v;
>   else if (x == y && y == z)
> return x * std::sqrt(T(3));
>   else if (z == 0 && y == 0)
> return x;
>   else if (x == 0 && z == 0)
> return y;
>   else if (x == 0 && y == 0)
> return z;
>   else
> {
>   T hi = std::max(std::max(x, y), z);
>   T lo0 = std::min(std::max(x, y), z);
>   T lo1 = std::min(x, y);
>   int e = 0;
>   hi = std::frexp(hi, &e);
>   lo0 = std::ldexp(lo0, -e);
>   lo1 = std::ldexp(lo1, -e);
>   T lo = lo0 * lo0 + lo1 * lo1;
>   return std::ldexp(std::sqrt(hi * hi + lo), e);
> }
> }
> 
> AFAIK
> https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=libstdc%2B%2B-v3/include/
> experimental/bits/simd_math.h;h=06e7b4496f9917f886f66fbd7629700dd17e55f9;
> hb=HEAD#l1168 is a precise and efficient implementation. It also avoids
> division altogether unless an input is subnormal.

What glibc does there for the 2 argument hypot is after handling the non-finite
cases finds the minimum and maximum and uses just normal multiplication,
addition + sqrt for the common case where maximum isn't too large and minimum
isn't too small.
So, no need to use frexp/ldexp, just comparisons of hi above against sqrt of
(max finite / 3), in that case scale by multiplying all 3 args by some
appropriate scale constant, and similarly otherwise if lo1 is too small by some
large scale.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-04 Thread mkretz at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #15 from Matthias Kretz (Vir)  ---
Your implementation still needs to solve:

1. Loss of precision because of division & subsequent scaling by max. Users
comparing std::hypot(x, y, z) against a simple std::sqrt(x * x + y * y + z * z)
might wonder why they want to use std::hypot if it's less precise.

2. Relatively high cost (in latency and throughput) because of the three
divisions. You could replace it with scale = 1/max; x *= scale; ... But that
can reduce precision even further.

3. Summation of the x, y, and z squares isn't associative if you care about
precision. A high quality implementation needs to add the two lowest values
first.

Here's a precise but inefficient implementation:
(https://compiler-explorer.com/z/ocGPnsYE3)

template 
[[gnu::optimize("-fno-unsafe-math-optimizations")]]
T
hypot3(T x, T y, T z)
{
  x = std::abs(x);
  y = std::abs(y);
  z = std::abs(z);
  if (std::isinf(x) || std::isinf(y) || std::isinf(z))
return std::__infinity_v;
  else if (std::isnan(x) || std::isnan(y) || std::isnan(z))
return std::__quiet_NaN_v;
  else if (x == y && y == z)
return x * std::sqrt(T(3));
  else if (z == 0 && y == 0)
return x;
  else if (x == 0 && z == 0)
return y;
  else if (x == 0 && y == 0)
return z;
  else
{
  T hi = std::max(std::max(x, y), z);
  T lo0 = std::min(std::max(x, y), z);
  T lo1 = std::min(x, y);
  int e = 0;
  hi = std::frexp(hi, &e);
  lo0 = std::ldexp(lo0, -e);
  lo1 = std::ldexp(lo1, -e);
  T lo = lo0 * lo0 + lo1 * lo1;
  return std::ldexp(std::sqrt(hi * hi + lo), e);
}
}

AFAIK
https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=libstdc%2B%2B-v3/include/experimental/bits/simd_math.h;h=06e7b4496f9917f886f66fbd7629700dd17e55f9;hb=HEAD#l1168
is a precise and efficient implementation. It also avoids division altogether
unless an input is subnormal.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-03 Thread de34 at live dot cn via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

Jiang An  changed:

   What|Removed |Added

 CC||de34 at live dot cn

--- Comment #14 from Jiang An  ---
(In reply to g.peterhoff from comment #11)

> constexpr
> * The hypot3 functions can thus be defined as _GLIBCXX_CONSTEXPR.

I think it should be marked _GLIBCXX26_CONSTEXPR.
(and we may also need to change bits/c++config)

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-02 Thread g.peterhoff--- via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #13 from g.peterh...@t-online.de ---
Thanks for the suggestions:
template 
constexpr _Tp __hypot3(_Tp __x, _Tp __y, _Tp __z)   noexcept
{
if (std::isinf(__x) | std::isinf(__y) | std::isinf(__z))
[[__unlikely__]]
return _Tp(INFINITY);
__x = std::fabs(__x);
__y = std::fabs(__y);
__z = std::fabs(__z);
const _Tp __max = std::fmax(std::fmax(__x, __y), __z);
if (__max == _Tp{}) [[__unlikely__]]
return __max;
__x /= __max;
__y /= __max;
__z /= __max;
return std::sqrt(__x*__x + __y*__y + __z*__z) * __max;
}

The functions are then set to constexpr/noexcept.

regards
Gero

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-03-01 Thread redi at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #12 from Jonathan Wakely  ---
(In reply to g.peterhoff from comment #11)
> Would this be a good implementation for hypot3 in cmath?

Thanks for the suggestion!

> #define GCC_UNLIKELY(x) __builtin_expect(x, 0)
> #define GCC_LIKELY(x) __builtin_expect(x, 1)
> 
> namespace __detail
> {
>   template 
>   inline _GLIBCXX_CONSTEXPR typename 
> enable_if::value,
> bool>::type   __isinf3(const _Tp __x, const _Tp __y, const _Tp __z)   noexcept
>   {
>   return bool(int(std::isinf(__x)) | int(std::isinf(__y)) |
> int(std::isinf(__z)));

The casts are redundant and just make it harder to read IMHO:

  return std::isinf(__x) | std::isinf(__y) | std::isinf(__z);



>   }
> 
>   template 
>   inline _GLIBCXX_CONSTEXPR typename 
> enable_if::value,
> _Tp>::type__hypot3(_Tp __x, _Tp __y, _Tp __z) noexcept
>   {
>   __x = std::fabs(__x);
>   __y = std::fabs(__y);
>   __z = std::fabs(__z);
> 
>   const _Tp
>   __max = std::fmax(std::fmax(__x, __y), __z);
> 
>   if (GCC_UNLIKELY(__max == _Tp{}))
>   {
>   return __max;
>   }
>   else
>   {
>   __x /= __max;
>   __y /= __max;
>   __z /= __max;
>   return std::sqrt(__x*__x + __y*__y + __z*__z) * __max;
>   }
>   }
> } //  __detail
> 
> 
>   template 
>   inline _GLIBCXX_CONSTEXPR typename 
> enable_if::value,
> _Tp>::type__hypot3(const _Tp __x, const _Tp __y, const _Tp __z)   noexcept

This is a C++17 function, you can use enable_if_t>,
but see below.

>   {
>   return (GCC_UNLIKELY(__detail::__isinf3(__x, __y, __z))) ?
> numeric_limits<_Tp>::infinity() : __detail::__hypot3(__x, __y, __z);
>   }
> 
> #undef GCC_UNLIKELY
> #undef GCC_LIKELY
> 
> How does it work?
> * Basically, I first pull out the special case INFINITY (see
> https://en.cppreference.com/w/cpp/numeric/math/hypot).
> * As an additional safety measure (to prevent misuse) the functions are
> defined by enable_if.

I don't think we want to slow down compilation like that. If users decide to
misuse std::__detail::__isinf3 then they get what they deserve.


> constexpr
> * The hypot3 functions can thus be defined as _GLIBCXX_CONSTEXPR.

Just use 'constexpr' because this function isn't compiled as C++98. Then you
don't need the 'inline'. Although the standard doesn't allow std::hypot3 to be
constexpr.

> Questions
> * To get a better runtime behavior I define GCC_(UN)LIKELY. Are there
> already such macros (which I have overlooked)?

No, but you can do:


  if (__isinf3(__x, __y, __x)) [[__unlikely__]]
...


> * The functions are noexcept. Does that make sense? If yes: why are the math
> functions not noexcept?

I think it's just because nobody bothered to add them, and I doubt much code
ever needs to check whether they are noexcept. The compiler should already know
the standard libm functions don't throw. For this function (which isn't in libm
and the compiler doesn't know about) it seems worth adding 'noexcept'.


Does splitting it into three functions matter? It seems simpler as a single
function:

  template
constexpr _Tp
__hypot3(_Tp __x, _Tp __y, _Tp __z) noexcept
{
  if (std::isinf(__x) | std::isinf(__y) | std::isinf(__z)) [[__unlikely__]]
return numeric_limits<_Tp>::infinity();
  __x = std::fabs(__x);
  __y = std::fabs(__y);
  __z = std::fabs(__z);
  const _Tp __max = std::fmax(std::fmax(__x, __y), __z);
  if (__max == _Tp{}) [[__unlikely__]]
return __max;
  else
{
  __x /= __max;
  __y /= __max;
  __z /= __max;
  return std::sqrt(__x*__x + __y*__y + __z*__z) * __max;
}
}

This would add a dependency on  to , which isn't currently
there. Maybe we could just return (_Tp)__builtin_huge_vall().

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2024-02-29 Thread g.peterhoff--- via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

g.peterh...@t-online.de changed:

   What|Removed |Added

 CC||g.peterh...@t-online.de

--- Comment #11 from g.peterh...@t-online.de ---
Would this be a good implementation for hypot3 in cmath?

#define GCC_UNLIKELY(x) __builtin_expect(x, 0)
#define GCC_LIKELY(x) __builtin_expect(x, 1)

namespace __detail
{
template 
inline _GLIBCXX_CONSTEXPR typename
enable_if::value, bool>::type __isinf3(const _Tp __x,
const _Tp __y, const _Tp __z)   noexcept
{
return bool(int(std::isinf(__x)) | int(std::isinf(__y)) |
int(std::isinf(__z)));
}

template 
inline _GLIBCXX_CONSTEXPR typename
enable_if::value, _Tp>::type  __hypot3(_Tp __x, _Tp __y,
_Tp __z) noexcept
{
__x = std::fabs(__x);
__y = std::fabs(__y);
__z = std::fabs(__z);

const _Tp
__max = std::fmax(std::fmax(__x, __y), __z);

if (GCC_UNLIKELY(__max == _Tp{}))
{
return __max;
}
else
{
__x /= __max;
__y /= __max;
__z /= __max;
return std::sqrt(__x*__x + __y*__y + __z*__z) * __max;
}
}
}   //  __detail


template 
inline _GLIBCXX_CONSTEXPR typename
enable_if::value, _Tp>::type  __hypot3(const _Tp __x,
const _Tp __y, const _Tp __z)   noexcept
{
return (GCC_UNLIKELY(__detail::__isinf3(__x, __y, __z))) ?
numeric_limits<_Tp>::infinity() : __detail::__hypot3(__x, __y, __z);
}

#undef GCC_UNLIKELY
#undef GCC_LIKELY

How does it work?
* Basically, I first pull out the special case INFINITY (see
https://en.cppreference.com/w/cpp/numeric/math/hypot).
* As an additional safety measure (to prevent misuse) the functions are defined
by enable_if.

constexpr
* The hypot3 functions can thus be defined as _GLIBCXX_CONSTEXPR.

Questions
* To get a better runtime behavior I define GCC_(UN)LIKELY. Are there already
such macros (which I have overlooked)?
* The functions are noexcept. Does that make sense? If yes: why are the math
functions not noexcept?

thx
Gero

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2021-05-04 Thread rguenth at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

Richard Biener  changed:

   What|Removed |Added

 Status|NEW |ASSIGNED

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2019-01-14 Thread kretz at kde dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #10 from Matthias Kretz  ---
Experience from testing my simd implementation:

I had failures (2 ULP deviation from long double result) when using 

auto __xx = abs(__x);
auto __yy = abs(__y);
auto __zz = abs(__z);
auto __hi = max(max(__xx, __yy), __zz);
auto __l0 = min(__zz, max(__xx, __yy));
auto __l1 = min(__yy, __xx);
__l0 /= __hi;
__l1 /= __hi;
auto __lo = __l0 * __l0 + __l1 * __l1;
return __hi * sqrt(1 + __lo);

Where the failures occur depends on wether FMA instructions are used. I have
observed only 1 ULP deviation from long double with my algorithm (independent
of FMAs).

Here are two data points that seem challenging:

hypot(0x1.965372p+125f, 0x1.795c92p+126f, 0x1.d0fc96p+125f) -> 0x1.e79366p+126f

hypot(0x1.235f24p+125f, 0x1.5b88f4p+125f, 0x1.d57828p+124f) -> 0x1.feaa26p+125f

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2019-01-10 Thread kretz at kde dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #9 from Matthias Kretz  ---
(In reply to emsr from comment #7)
> What does this do?
> 
>   auto __hi_exp =
> __hi & simd<_T, _Abi>(std::numeric_limits<_T>::infinity()); // no error

component-wise bitwise and of __hi and +inf. Or in other words, it sets all
sign bits and mantissa bits to 0. Consequently `__hi / __hi_exp` returns __hi
with the exponent bits set to 0x3f8 (float) / 0x3ff (double) and the mantissa
bits unchanged.

> Sorry, I have no simd knowlege yet.

It's a very simple idea:
- simd holds simd::size() many values of type T.
- All operators/operations act component-wise.

See Section 9 in wg21.link/N4793 for the last WD of the TS.

> Anyway, doesn't the large scale risk overflow if a, b are large? I guess I'm
> lost.

It basically has the same underflow risks as your implementation does, and no
risk of overflow.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2019-01-10 Thread emsr at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #8 from emsr at gcc dot gnu.org ---
(In reply to Matthias Kretz from comment #6)

> > How precise is hypot supposed to be? I know it is supposed to try and avoid
> > spurious overflow/underflow, but I am not convinced that it should aim for
> > correct rounding.
> 
> That's a good question for all of  / . Any normative wording
> on that question would be (welcome) news to me. AFAIK precision is left
> completely as QoI. So, except for the Annex F requirements (which we can
> drop with -ffast-math), let's implement all of  as `return 0;`. ;-)
> 

It looks like C is trying to incorporate ISO/IEC TS 18661-1 Floating-point
extensions for C — Part 1: Binary floating-point arithmetic, etc.

This adds a lot of rounding control but it looks like correcly rounded
transcendentals are still merely a recommendation.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2019-01-10 Thread emsr at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #7 from emsr at gcc dot gnu.org ---
What does this do?

  auto __hi_exp =
__hi & simd<_T, _Abi>(std::numeric_limits<_T>::infinity()); // no error

Sorry, I have no simd knowlege yet.

Anyway, doesn't the large scale risk overflow if a, b are large? I guess I'm
lost.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2019-01-10 Thread kretz at kde dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #6 from Matthias Kretz  ---
(In reply to Marc Glisse from comment #4)
> Your "reference" number seems strange. Why not do the computation with
> double (or long double or mpfr) or use __builtin_hypotf? Note that it
> changes the value.

Doh. (I didn't know the builtin exists. But use of (long) double should have
been a no-brainer.) I guess my point was the precision of the input to sqrt not
the result of sqrt. The sqrt makes that error almost irrelevant, though. My
numerical analysis skills are not good enough to argue for what approach is
better. But intuitively, keeping the information of the `amax` mantissa for the
final multiplication around might actually make that approach slightly better
(if the input to the sqrt were precise that wouldn't be true, though - but it
never is).

> How precise is hypot supposed to be? I know it is supposed to try and avoid
> spurious overflow/underflow, but I am not convinced that it should aim for
> correct rounding.

That's a good question for all of  / . Any normative wording on
that question would be (welcome) news to me. AFAIK precision is left completely
as QoI. So, except for the Annex F requirements (which we can drop with
-ffast-math), let's implement all of  as `return 0;`. ;-)

> (I see that you are using clang in that godbolt link, with gcc I need to
> mark the global variables with "extern const" to get a similar asm)

Thanks for the hint. I switched to clang when GCC started to produce code
instead of constants in the asm. (I also like the unicode identifier support in
clang ;-))

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2019-01-09 Thread emsr at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #5 from emsr at gcc dot gnu.org ---
Right.  fma is irrelevant.
I will wind up with sqrt(1 + __lo).
I won't hope that max * __scale == 1 here but just add 1.  And why waste the
partial sort?

New patch tomorrow a.m. (I guess I'm too late though.)

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2019-01-05 Thread glisse at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #4 from Marc Glisse  ---
(In reply to Matthias Kretz from comment #3)
> Did you consider the error introduced by scaling with __amax? I made sure
> that the division is without error by zeroing the mantissa bits. Here's a
> motivating example that shows an error of 1 ulp otherwise:
> https://godbolt.org/z/_U2K7e

Your "reference" number seems strange. Why not do the computation with double
(or long double or mpfr) or use __builtin_hypotf? Note that it changes the
value.

How precise is hypot supposed to be? I know it is supposed to try and avoid
spurious overflow/underflow, but I am not convinced that it should aim for
correct rounding.

(I see that you are using clang in that godbolt link, with gcc I need to mark
the global variables with "extern const" to get a similar asm)

> About std::fma, how bad is the performance hit if there's no instruction for
> it?

FMA doesn't seem particularly relevant here.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2019-01-05 Thread kretz at kde dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #3 from Matthias Kretz  ---
Did you consider the error introduced by scaling with __amax? I made sure that
the division is without error by zeroing the mantissa bits. Here's a motivating
example that shows an error of 1 ulp otherwise: https://godbolt.org/z/_U2K7e

About std::fma, how bad is the performance hit if there's no instruction for
it?

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2019-01-04 Thread emsr at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

--- Comment #2 from emsr at gcc dot gnu.org ---
I have this in another tree which solves the inf issue:
  namespace __detail
  {
// Avoid including all of 
template
  constexpr _Tp
  __fmax3(_Tp __x, _Tp __y, _Tp __z)
  { return std::fmax(std::fmax(__x, __y), std::fmax(__y, __z)); }

template
  constexpr _Tp
  __hypot3(_Tp __x, _Tp __y, _Tp __z)
  {
if (std::isnan(__x)
 || std::isnan(__y)
 || std::isnan(__z))
  return std::numeric_limits<_Tp>::quiet_NaN();
else
  {
__x = std::abs(__x);
__y = std::abs(__y);
__z = std::abs(__z);
const auto __amax = __fmax3(__x, __y, __z);
if (__amax == _Tp{0})
  return _Tp{0};
else if (std::__detail::__isinf(__amax))
  return std::numeric_limits<_Tp>::infinity();
else
  {
__x /= __amax;
__y /= __amax;
__z /= __amax;
return __amax * std::sqrt(__x * __x + __y * __y + __z * __z);
  }
  }
  }
  }

I get your point about scaling and adding smallest first.  I have access to
std::fma().

For me I think this means:
/*
  const auto __amax = max(max(__x, __y), __z);
  const auto __l0 = min(__z, max(__x, __y));
  const auto __l1 = min(__y, __x);

  const auto __scale = 1 / __amax;

  __l0 *= __scale;
  __l1 *= __scale;
  // Add the two smaller values first.
  const auto __lo = __l0 * __l0 + __l1 * __l1;
  const auto __r  = __amax * sqrt(fma(__h1, __h1, __lo));

*/

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2018-12-06 Thread kretz at kde dot org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

Matthias Kretz  changed:

   What|Removed |Added

 CC||kretz at kde dot org

--- Comment #1 from Matthias Kretz  ---
Testcase for the inf input case
(https://wandbox.org/permlink/wDla0jQOtKFUsHNs):

  constexpr float inf = std::numeric_limits::infinity();
  std::cout << std::hypot(inf, 1.f, 1.f) << '\n';
  std::cout << std::hypot(inf, inf, 1.f) << '\n';
  std::cout << std::hypot(inf, inf, inf) << '\n';

All of these return NaN, but surely should return inf instead. Joseph mentions
this issue in his mails.


here's what I currently have for std::experimental::simd's hypot overload:

template 
simd<_T, _Abi> hypot(simd<_T, _Abi> __x, simd<_T, _Abi> __y, simd<_T, _Abi>
__z)
{
  using namespace __proposed::float_bitwise_operators;
  __x   = abs(__x);// no error
  __y   = abs(__y);// no error
  __z   = abs(__z);// no error
  auto __hi = max(max(__x, __y), __z); // no error
  auto __l0 = min(__z, max(__x, __y)); // no error
  auto __l1 = min(__y, __x);   // no error
  auto __hi_exp =
__hi & simd<_T, _Abi>(std::numeric_limits<_T>::infinity()); // no error
  where(__hi_exp == 0, __hi_exp) = std::numeric_limits<_T>::min();

  // scale __hi to 0x1.???p0 and __lo by the same factor
  auto __scale = 1 / __hi_exp;   // no error
  auto __h1  = __hi / __hi_exp; // no error
  __l0 *= 1 / __hi_exp; // no error
  __l1 *= 1 / __hi_exp; // no error
  auto __lo = __l0 * __l0 + __l1 * __l1; // add the two smaller values first
  auto __r  = __hi_exp * sqrt(__lo + __h1 * __h1);
  where(__l0 + __l1 == 0, __r) = __hi;
  where(isinf(__x) || isinf(__y) || isinf(__z), __r) =
std::numeric_limits<_T>::infinity();
  return __r;
}

IIUC this might still produce underflow exceptions, even though the answer is
precise. And without FMA in the accumulation inside the sqrt there may be a
larger than .5 ULP error on the result. But that's just to the best of my
knowledge.

[Bug libstdc++/77776] C++17 std::hypot implementation is poor

2017-12-12 Thread redi at gcc dot gnu.org
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6

Jonathan Wakely  changed:

   What|Removed |Added

 Status|UNCONFIRMED |NEW
   Last reconfirmed||2017-12-12
 Ever confirmed|0   |1