[fpc-pascal] Functions for dealing with floating-point precision

Thomas Kurz via fpc-pascal Sun, 19 Jun 2022 13:38:48 -0700

Hello,

in my program I have need for checking floating-point precision. I'm internally 
using floating-points for calculations, but in the end I have to use integer 
numbers. I cannot use Round() because I have to check for thresholds. I.e. that 
I wish to accept a value of 1.9999.... as being ">=2". As you can see, I cannot 
use Trunc() either, so I decided to check for the difference in ULPs (unit in 
the last place).


I couldn't find the corresponding functions in the RTL, but I think they should 
be included. I wrote the functions listed below. I would appreciate if the 
maintainers of FPC decide to add them to the Math unit. I skipped the functions 
for the Extended type, but if it's being desired, I can add hand them in later.

Kind regards,
Thomas


*** code:


const MAX_ULP_SINGLE = 2.028240960E+31;
const MAX_ULP_DOUBLE = 1.9958403095347198E+292;

function SingleToLongBits (Argument: Single): UInt32;
  begin
    Move (Argument, Result, SizeOf(UInt32));
  end;

{$ifdef FPC_HAS_TYPE_DOUBLE}
function DoubleToLongBits (Argument: Double): UInt64;
  begin
    Move (Argument, Result, SizeOf(UInt64));
  end;
{$endif FPC_HAS_TYPE_DOUBLE}

function LongBitsToSingle (Argument: UInt32): Single;
  begin
    Move (Argument, Result, SizeOf(Single));
  end;

{$ifdef FPC_HAS_TYPE_DOUBLE}
function LongBitsToDouble (Argument: UInt64): Double;
  begin
    Move (Argument, Result, SizeOf(Double));
  end;
{$endif FPC_HAS_TYPE_DOUBLE}

// returns the first floating-point argument with the sign of the second 
floating-point argument
function CopySign (x: Single; y: Single): Single;
  begin
    Exit (Abs(x) * Sign(y));
  end;

{$ifdef FPC_HAS_TYPE_DOUBLE}
// returns the first floating-point argument with the sign of the second 
floating-point argument
function CopySign (x: Double; y: Double): Double;
  begin
    Exit (Abs(x) * Sign(y));
  end;
{$endif FPC_HAS_TYPE_DOUBLE}

function IsSameSign (x: Single; y: Single): Boolean;
  begin
    Exit (CopySign(x, y) = x);
  end;

{$ifdef FPC_HAS_TYPE_DOUBLE}
function IsSameSign (x: Double; y: Double): Boolean;
  begin
    Exit (CopySign(x, y) = x);
  end;
{$endif FPC_HAS_TYPE_DOUBLE}

function NextAfter (Start: Single; Direction: Single): Single;
  var
    absStart: Single;
    absDir: Single;
    toZero: Boolean;
  begin
    // If either argument is a NaN, then NaN is returned.
    if IsNan(Start) OR IsNan(Direction) then Exit (NaN);
    // If both arguments compare as equal the second argument is returned.
    if (Start = Direction) THEN Exit (Direction);
    absStart := Abs(Start);
    absDir := Abs(Direction);
    toZero := NOT IsSameSign (Start, Direction) OR (AbsDir < absStart);
    if (toZero) then begin
      // we are reducing the magnitude, going toward zero.
      if (absStart = MinSingle) then Exit (CopySign (0.0, Start));
      if IsInfinite(absStart) THEN Exit (CopySign (MaxSingle, Start));
      Exit (CopySign (LongBitsToSingle (Pred(SingleToLongBits(absStart))), 
Start));
    end else begin
      // we are increasing the magnitude, toward +-Infinity
      if (Start = 0.0) then Exit (CopySign (MinSingle, Direction));
      if (absStart = MaxSingle) then Exit (CopySign (Infinity, Start));
      Exit (CopySign (LongBitsToSingle(Succ(SingleToLongBits(absStart))), 
Start));
    end;
  end;

{$ifdef FPC_HAS_TYPE_DOUBLE}
function NextAfter (Start: Double; Direction: Double): Double;
  var
    absStart: Double;
    absDir: Double;
    toZero: Boolean;
  begin
    // If either argument is a NaN, then NaN is returned.
    if IsNan(Start) OR IsNan(Direction) then Exit (NaN);
    // If both arguments compare as equal the second argument is returned.
    if (Start = Direction) THEN Exit (Direction);
    absStart := Abs(Start);
    absDir := Abs(Direction);
    toZero := NOT IsSameSign (Start, Direction) OR (AbsDir < AbsStart);

    if (toZero) then begin
      // we are reducing the magnitude, going toward zero.
      if (absStart = MinDouble) then Exit (CopySign (0.0, Start));
      if IsInfinite(absStart) THEN Exit (CopySign (MaxDouble, Start));
      Exit (CopySign (LongBitsToDouble (Pred(DoubleToLongBits(absStart))), 
Start));
    end else begin
      // we are increasing the magnitude, toward +-Infinity
      if (Start = 0.0) then Exit (CopySign (MinDouble, Direction));
      if (AbsStart = MaxDouble) then Exit (CopySign (Infinity, Start));
      Exit (CopySign (LongBitsToDouble(Succ(DoubleToLongBits(absStart))), 
start));
    end;
  end;
{$endif FPC_HAS_TYPE_DOUBLE}

function FloatPrior (Argument: Single): Single;
  begin
    Exit (NextAfter (Argument, NegInfinity));
  end;

function FloatNext (Argument: Single): Single;
  begin
    Exit (NextAfter (Argument, Infinity));
  end;

{$ifdef FPC_HAS_TYPE_DOUBLE}
function FloatPrior (Argument: Double): Double;
  begin
    Exit (NextAfter (Argument, NegInfinity));
  end;
{$endif FPC_HAS_TYPE_DOUBLE}

{$ifdef FPC_HAS_TYPE_DOUBLE}
function FloatNext (Argument: Double): Double;
  begin
    Exit (NextAfter (Argument, Infinity));
  end;
{$endif FPC_HAS_TYPE_DOUBLE}

function ULP (Argument: Single): Single;
  begin
    // If the argument is NaN, then the result is NaN.
    if IsNaN (Argument) then Exit (NaN);
    // If the argument is positive or negative infinity, then the result is 
positive infinity.
    if IsInfinite (Argument) then Exit (Infinity);
    // If the argument is positive or negative zero, then the result is 
Single.MIN_VALUE.
    if (Argument = 0.0) then Exit (MinSingle);
    Argument := Abs(Argument);
    // If the argument is ±Single.MAX_VALUE, then the result is equal to 2^104.
    if (Argument = MaxSingle) then Exit (MAX_ULP_SINGLE);
    Exit (FloatNext(Argument) - Argument);
  end;

{$ifdef FPC_HAS_TYPE_DOUBLE}
function ULP (Argument: Double): Double;
  begin
    // If the argument is NaN, then the result is NaN.
    if IsNaN (Argument) then Exit (NaN);
    // If the argument is positive or negative infinity, then the result is 
positive infinity.
    if IsInfinite (Argument) then Exit (Infinity);
    // If the argument is positive or negative zero, then the result is 
Double.MIN_VALUE.
    if (Argument = 0.0) then Exit (MinDouble);
    Argument := Abs(Argument);
    // If the argument is ±Double.MAX_VALUE, then the result is equal to 2^971.
    if (Argument = MaxDouble) then Exit (MAX_ULP_DOUBLE);
    Exit (FloatNext(Argument) - Argument);
  end;
{$endif FPC_HAS_TYPE_DOUBLE}

_______________________________________________
fpc-pascal maillist  -  fpc-pascal@lists.freepascal.org
https://lists.freepascal.org/cgi-bin/mailman/listinfo/fpc-pascal

[fpc-pascal] Functions for dealing with floating-point precision

Reply via email to