[HACKERS] Current int & float overflow checking is slow.

Andres Freund Tue, 24 Oct 2017 03:40:35 -0700

Hi,

In analytics queries that involve a large amounts of integers and/or
floats (i.e. a large percentage) it's quite easy to see the functions
underlying the operators in profiles. Partially that's the function call
overhead, but even *after* removing most of that via JITing, they're
surprisingly expensive.


Largely that's due to the overflow checks.

For integers we currently do:

#define SAMESIGN(a,b)   (((a) < 0) == ((b) < 0))

        /*
         * Overflow check.  If the inputs are of different signs then their sum
         * cannot overflow.  If the inputs are of the same sign, their sum had
         * better be that sign too.
         */
        if (SAMESIGN(arg1, arg2) && !SAMESIGN(result, arg1))
                ereport(ERROR,
                                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
                                 errmsg("integer out of range")));

which means that we turn a single integer instruction into ~10,
including a bunch of branches.  All that despite the fact that most
architectures have flag registers signalling integer overflow. It's just
that C doesn't easily make that available.

gcc exposes more efficient overflow detection via intrinsics:
https://gcc.gnu.org/onlinedocs/gcc-7.1.0/gcc/Integer-Overflow-Builtins.html

Using that turns the non-error path from int4pl from:

   0x0000000000826ec0 <+0>:     mov    0x20(%rdi),%rcx # arg1
   0x0000000000826ec4 <+4>:     mov    0x28(%rdi),%rdx # arg2
   0x0000000000826ec8 <+8>:     mov    %ecx,%esi
   0x0000000000826eca <+10>:    lea    (%rdx,%rcx,1),%eax # add
   # overflow check
   0x0000000000826ecd <+13>:    shr    $0x1f,%edx
   0x0000000000826ed0 <+16>:    not    %esi
   0x0000000000826ed2 <+18>:    shr    $0x1f,%esi
   0x0000000000826ed5 <+21>:    cmp    %dl,%sil
   0x0000000000826ed8 <+24>:    je     0x826f30 <int4pl+112>
   0x0000000000826eda <+26>:    mov    %eax,%edx
   0x0000000000826edc <+28>:    shr    $0x1f,%ecx
   0x0000000000826edf <+31>:    shr    $0x1f,%edx
   0x0000000000826ee2 <+34>:    cmp    %cl,%dl
   0x0000000000826ee4 <+36>:    je     0x826f30 <int4pl+112>
   /* overflow error code */
   0x0000000000826f30 <+112>:   retq

into

   0x0000000000826ec0 <+0>:     mov    0x28(%rdi),%rax # arg2
   0x0000000000826ec4 <+4>:     add    0x20(%rdi),%eax # arg1 + arg2
   0x0000000000826ec7 <+7>:     jo     0x826ecc <int4pl+12> # jump if overflowed
   0x0000000000826ec9 <+9>:     mov    %eax,%eax # clear high bits
   0x0000000000826ecb <+11>:    retq

which, not that surprisingly, is faster. Not to speak of easier to read
;)

Besides the fact that the code is faster, there's also the issue that
the current way to do overflow checks is not actually correct C, and
requires compiler flags like -fwrapv.


For floating point it's even worse.

/*
 * check to see if a float4/8 val has underflowed or overflowed
 */
#define CHECKFLOATVAL(val, inf_is_valid, zero_is_valid)                 \
do {                                                                            
                                        \
        if (isinf(val) && !(inf_is_valid))                                      
                \
                ereport(ERROR,                                                  
                                \
                                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),   
\
                  errmsg("value out of range: overflow")));                     
        \
                                                                                
                                                \
        if ((val) == 0.0 && !(zero_is_valid))                                   
        \
                ereport(ERROR,                                                  
                                \
                                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),   
\
                 errmsg("value out of range: underflow")));                     
        \
} while(0)

    result = arg1 + arg2;

    /*
     * There isn't any way to check for underflow of addition/subtraction
     * because numbers near the underflow value have already been rounded to
     * the point where we can't detect that the two values were originally
     * different, e.g. on x86, '1e-45'::float4 == '2e-45'::float4 ==
     * 1.4013e-45.
     */
    CHECKFLOATVAL(result, isinf(arg1) || isinf(arg2), true);

The disassembled code for float4pl is:
   0x000000000043ce90 <+0>:     vmovss 0x20(%rdi),%xmm1
   0x000000000043ce95 <+5>:     vmovss 0x28(%rdi),%xmm2
   0x000000000043ce9a <+10>:    vmovss 0x2b6a7e(%rip),%xmm3        # 0x6f3920
   0x000000000043cea2 <+18>:    vaddss %xmm1,%xmm2,%xmm0
   0x000000000043cea6 <+22>:    vmovaps %xmm0,%xmm4
   0x000000000043ceaa <+26>:    vandps %xmm3,%xmm4,%xmm4
   0x000000000043ceae <+30>:    vucomiss 0x2b6a4a(%rip),%xmm4        # 0x6f3900
   0x000000000043ceb6 <+38>:    jbe    0x43ced4 <float4pl+68>
   0x000000000043ceb8 <+40>:    vandps %xmm3,%xmm1,%xmm1
   0x000000000043cebc <+44>:    vucomiss 0x2b6a3c(%rip),%xmm1        # 0x6f3900
   0x000000000043cec4 <+52>:    ja     0x43ced4 <float4pl+68>
   0x000000000043cec6 <+54>:    vandps %xmm3,%xmm2,%xmm2
   0x000000000043ceca <+58>:    vucomiss 0x2b6a2e(%rip),%xmm2        # 0x6f3900
   0x000000000043ced2 <+66>:    jbe    0x43ced9 <float4pl+73>
   0x000000000043ced4 <+68>:    vmovd  %xmm0,%eax
   0x000000000043ced8 <+72>:    retq
   0x000000000043ced9 <+73>:    push   %rbx
   # call to ereport

clang's code is much worse, it generates *external* function calls for
isinf (can be fixed by redefining isinf to __builtin_isinf).

Entirely removing the overflow checks results in:
   0x0000000000801850 <+0>:     vmovss 0x28(%rdi),%xmm1 # arg2
   0x0000000000801855 <+5>:     vaddss 0x20(%rdi),%xmm1,%xmm0 # arg1 + arg2
   0x000000000080185a <+10>:    vmovd  %xmm0,%eax # convert to int
   0x000000000080185e <+14>:    mov    %eax,%eax # clear upper bits
   0x0000000000801860 <+16>:    retq

which unsurprisingly is a good bit faster.  float4mul etc generate even
worse code.

There's no comparable overflow handling to the above integer
intrinsics. But I think we can still do a lot better. Two very different
ways:

1) Just give up on detecting overflows for floats. Generating inf in
   these cases actually seems entirely reasonable. We already don't
   detect them in a bunch of cases anyway.  I can't quite parse the
   standard's language around this.
2) Use platform specific float exception handling where available. We
   could at backend start, and in FloatExceptionHandler(), us
   feenableexcept() (windows has similar) to trigger SIGFPE on float
   overflow.
3) Magic?

Greetings,

Andres Freund


-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] Current int & float overflow checking is slow.

Reply via email to