On Thu, Sep 11, 2014 at 11:27 AM, Andres Freund <and...@2ndquadrant.com>
wrote:

> On 2014-09-11 10:32:24 -0300, Arthur Silva wrote:
> > Unaligned memory access received a lot attention in Intel post-Nehalen
> era.
> > So it may very well pay off on Intel servers. You might find this blog
> post
> > and it's comments/external-links interesting
> >
> http://lemire.me/blog/archives/2012/05/31/data-alignment-for-speed-myth-or-reality/
>
> FWIW, the reported results of imo pretty meaningless for postgres. It's
> sequential access over larger amount of memory. I.e. a perfectly
> prefetchable workload where it doesn't matter if superflous cachelines
> are fetched because they're going to be needed next round anyway.
>
> In many production workloads one of the most busy accesses to individual
> datums is the binary search on individual pages during index
> lookups. That's pretty much exactly the contrary to the above.
>
> Not saying that it's not going to be a benefit in many scenarios, but
> it's far from being as simple as saying that unaligned accesses on their
> own aren't penalized anymore.
>
> Greetings,
>
> Andres Freund
>
> --
>  Andres Freund                     http://www.2ndQuadrant.com/
>  PostgreSQL Development, 24x7 Support, Training & Services
>

I modified the test code to use a completely random scan pattern to test
something that completely thrashes the cache. Not realistic, but it still
confirms the hypothesis that the overhead is minimal on modern Intel.

------------------ test results compiling for 32bit ------------------
 processing word of size 2
offset = 0
 average time for offset 0 is 422.7
offset = 1
 average time for offset 1 is 422.85

 processing word of size 4
offset = 0
 average time for offset 0 is 436.6
offset = 1
 average time for offset 1 is 451
offset = 2
 average time for offset 2 is 444.3
offset = 3
 average time for offset 3 is 441.9

 processing word of size 8
offset = 0
 average time for offset 0 is 630.15
offset = 1
 average time for offset 1 is 653
offset = 2
 average time for offset 2 is 655.5
offset = 3
 average time for offset 3 is 660.85
offset = 4
 average time for offset 4 is 650.1
offset = 5
 average time for offset 5 is 656.9
offset = 6
 average time for offset 6 is 656.6
offset = 7
 average time for offset 7 is 656.9


------------------ test results compiling for 64bit ------------------
 processing word of size 2
offset = 0
 average time for offset 0 is 402.55
offset = 1
 average time for offset 1 is 406.9

 processing word of size 4
offset = 0
 average time for offset 0 is 424.05
offset = 1
 average time for offset 1 is 436.55
offset = 2
 average time for offset 2 is 435.1
offset = 3
 average time for offset 3 is 435.3

 processing word of size 8
offset = 0
 average time for offset 0 is 444.9
offset = 1
 average time for offset 1 is 470.25
offset = 2
 average time for offset 2 is 468.95
offset = 3
 average time for offset 3 is 476.75
offset = 4
 average time for offset 4 is 474.9
offset = 5
 average time for offset 5 is 468.25
offset = 6
 average time for offset 6 is 469.8
offset = 7
 average time for offset 7 is 469.1
//  g++ -O2 -o test test.cpp && ./test


#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <iostream>
#include <cassert>
#include <vector>
#include "inttypes.h"

using namespace std;

// Minimal wall-clock stopwatch built on gettimeofday().
//
// t1 is the start timestamp and t2 the most recent split timestamp; both are
// public. All durations are reported in whole milliseconds.
class WallClockTimer
{
public:
    struct timeval t1, t2;

    // Starts the stopwatch at construction time (t1 == t2 == now).
    WallClockTimer() :
        t1(), t2()
    {
        reset();
    }

    // Restarts the stopwatch: both timestamps are set to the current time.
    void reset()
    {
        gettimeofday(&t1, 0);
        t2 = t1;
    }

    // Returns the milliseconds between t1 and t2.
    //
    // The seconds are subtracted *before* scaling to milliseconds: multiplying
    // an absolute epoch tv_sec by 1000 overflows a 32-bit time_t/long, which
    // is undefined behaviour on 32-bit builds. Each microsecond field is
    // still truncated to milliseconds individually, so the result matches the
    // previous formula wherever that formula did not overflow.
    int elapsed() const
    {
        return static_cast<int>((t2.tv_sec - t1.tv_sec) * 1000
                                + (t2.tv_usec / 1000 - t1.tv_usec / 1000));
    }

    // Records the current time in t2 and returns the milliseconds since t1.
    int split()
    {
        gettimeofday(&t2, 0);
        return elapsed();
    }
};

// xor shift
// Xorshift pseudo-random generator with four 32-bit words of state.
// The state lives in function-local statics with fixed seeds, so the output
// sequence is the same on every program run. Returns the next 32-bit value.
uint32_t xor128(void)
{
    static uint32_t s0 = 123456789;
    static uint32_t s1 = 362436069;
    static uint32_t s2 = 521288629;
    static uint32_t s3 = 88675123;

    // Scramble the oldest state word before it is shifted out.
    const uint32_t mixed = s0 ^ (s0 << 11);

    // Slide the state window down by one word.
    s0 = s1;
    s1 = s2;
    s2 = s3;

    // Fold the scrambled word into the newest word; that is the output.
    s3 ^= s3 >> 19;
    s3 ^= mixed ^ (mixed >> 8);
    return s3;
}

// Benchmarks random-index writes and reads of elements of type T for every
// byte offset in [0, sizeof(T)) applied to the start of a freshly allocated
// array, and prints the average wall-clock time per offset to cout.
//
// For each offset, `repeat` timed passes are run. One pass performs N writes
// followed by N reads at indices drawn from xor128() % N; the cost of the
// xor128() calls is therefore part of the measured time. Times come from
// WallClockTimer and are in whole milliseconds.
template <class T>
void runtest()
{
    // Number of elements addressed in each pass.
    size_t N = 10 * 1000 * 1000 ;
    // Number of timed passes averaged per offset.
    int repeat = 20;
    WallClockTimer timer;
    // Compile-time switch for the extra bounds assert below; it is off here.
    const bool paranoid = false;
    cout<<" processing word of size "<<sizeof(T)<<endl;
    for(unsigned int offset = 0; offset<sizeof(T); ++offset)
    {
        // A new array per offset, with two spare elements so that the window
        // of N elements shifted by up to sizeof(T)-1 bytes stays inside it.
        vector<T> bigarray(N+2);
        cout<<"offset = "<<offset<<endl;
        // Shift the base address by `offset` bytes. For non-zero offsets the
        // resulting T* is no longer a whole number of elements from the array
        // start, which is the access pattern under test.
        T * const begin =   reinterpret_cast<T *> (reinterpret_cast<uintptr_t>(&bigarray[0]) + offset);
        assert(offset + reinterpret_cast<uintptr_t>(&bigarray[0])  == reinterpret_cast<uintptr_t>(begin)  );
        T * const end = begin + N;
        // Only checked when `paranoid` is true: the shifted window must end
        // before the last element of the backing array.
        if(paranoid) assert(reinterpret_cast<uintptr_t>(end)<reinterpret_cast<uintptr_t>(&bigarray.back()));
        // Sum of the per-pass times (ms) for this offset.
        int sumt = 0;
        //cout<<" ignore this: ";
        for(int k = 0 ; k < repeat; ++k)
        {
            timer.reset();
            // Write phase: N stores to pseudo-random positions in the window.
            for(size_t i = 0; i <N; ++i)
            {
                int ri = xor128() % N;
                begin[ri] = static_cast<T>( i );
            }
            // Read phase: N loads from pseudo-random positions. `val` is
            // volatile, so every update of it must be performed -- presumably
            // to keep the compiler from discarding the loads.
            volatile T val = 1;
            for(size_t i = 0; i <N; ++i)
            {
                int ri = xor128() % N;
                val += begin[ri] * val  + 33;
            }
            int time = timer.split();
            sumt += time;
            //cout<<val;
        }
        //cout<<endl;
        // offset is always < sizeof(T) here, so the modulo leaves it unchanged.
        cout<<" average time for offset "<<(offset%sizeof(T))<<" is "<<sumt * 1.0 /repeat<<endl;
    }

}



// Entry point: runs the benchmark for 16-, 32- and 64-bit unsigned words in
// turn, emitting a blank line after each run. Exits with status 0.
int main()
{
    runtest<uint16_t>();
    std::cout << std::endl;

    runtest<uint32_t>();
    std::cout << std::endl;

    runtest<uint64_t>();
    std::cout << std::endl;
}
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to