On Thu, Sep 11, 2014 at 11:27 AM, Andres Freund <and...@2ndquadrant.com> wrote:
> On 2014-09-11 10:32:24 -0300, Arthur Silva wrote: > > Unaligned memory access received a lot of attention in Intel post-Nehalem > era. > > So it may very well pay off on Intel servers. You might find this blog > post > > and its comments/external-links interesting > > > http://lemire.me/blog/archives/2012/05/31/data-alignment-for-speed-myth-or-reality/ > > FWIW, the reported results are imo pretty meaningless for postgres. It's > sequential access over a larger amount of memory. I.e. a perfectly > prefetchable workload where it doesn't matter if superfluous cachelines > are fetched because they're going to be needed next round anyway. > > In many production workloads one of the most busy accesses to individual > datums is the binary search on individual pages during index > lookups. That's pretty much exactly the contrary to the above. > > Not saying that it's not going to be a benefit in many scenarios, but > it's far from being as simple as saying that unaligned accesses on their > own aren't penalized anymore. > > Greetings, > > Andres Freund > > -- > Andres Freund http://www.2ndQuadrant.com/ > PostgreSQL Development, 24x7 Support, Training & Services > I modified the test code to use a completely random scan pattern to test something that completely thrashes the cache. Not realistic but still confirms the hypothesis that the overhead is minimal on modern Intel.
------------------ test results compiling for 32bit ------------------ processing word of size 2 offset = 0 average time for offset 0 is 422.7 offset = 1 average time for offset 1 is 422.85 processing word of size 4 offset = 0 average time for offset 0 is 436.6 offset = 1 average time for offset 1 is 451 offset = 2 average time for offset 2 is 444.3 offset = 3 average time for offset 3 is 441.9 processing word of size 8 offset = 0 average time for offset 0 is 630.15 offset = 1 average time for offset 1 is 653 offset = 2 average time for offset 2 is 655.5 offset = 3 average time for offset 3 is 660.85 offset = 4 average time for offset 4 is 650.1 offset = 5 average time for offset 5 is 656.9 offset = 6 average time for offset 6 is 656.6 offset = 7 average time for offset 7 is 656.9 ------------------ test results compiling for 64bit ------------------ processing word of size 2 offset = 0 average time for offset 0 is 402.55 offset = 1 average time for offset 1 is 406.9 processing word of size 4 offset = 0 average time for offset 0 is 424.05 offset = 1 average time for offset 1 is 436.55 offset = 2 average time for offset 2 is 435.1 offset = 3 average time for offset 3 is 435.3 processing word of size 8 offset = 0 average time for offset 0 is 444.9 offset = 1 average time for offset 1 is 470.25 offset = 2 average time for offset 2 is 468.95 offset = 3 average time for offset 3 is 476.75 offset = 4 average time for offset 4 is 474.9 offset = 5 average time for offset 5 is 468.25 offset = 6 average time for offset 6 is 469.8 offset = 7 average time for offset 7 is 469.1
// g++ -O2 -o test test.cpp && ./test
//
// Micro-benchmark for aligned vs. unaligned memory access.
//
// For each word type (uint16_t, uint32_t, uint64_t) and for every byte offset
// in [0, sizeof(T)), the program performs N random-index stores followed by
// N random-index loads on a buffer whose word slots start `offset` bytes past
// the start of a std::vector<T>'s storage, and prints the average wall-clock
// time in milliseconds over `repeat` rounds. The time spent in the random
// number generator is included in every measurement.
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>

#include <cassert>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <vector>

#include <inttypes.h>

// Millisecond stopwatch built on gettimeofday().
//
// t1 is the start timestamp, t2 the most recent split timestamp. Note that
// gettimeofday() reports wall-clock time, so a clock adjustment during a
// measurement would distort the result.
class WallClockTimer {
public:
    struct timeval t1, t2;

    // Starts the stopwatch at construction time.
    WallClockTimer() : t1(), t2() {
        gettimeofday(&t1, 0);
        t2 = t1;
    }

    // Restarts the stopwatch: both timestamps are set to "now".
    void reset() {
        gettimeofday(&t1, 0);
        t2 = t1;
    }

    // Returns the whole milliseconds between t1 and t2.
    //
    // The difference is taken BEFORE scaling and is carried in 64 bits:
    // multiplying an absolute tv_sec by 1000 overflows a signed 32-bit
    // time_t/long, which is what 32-bit builds of this test would hit.
    int elapsed() const {
        const int64_t seconds =
            static_cast<int64_t>(t2.tv_sec) - static_cast<int64_t>(t1.tv_sec);
        const int64_t micros =
            static_cast<int64_t>(t2.tv_usec) - static_cast<int64_t>(t1.tv_usec);
        return static_cast<int>((seconds * 1000000 + micros) / 1000);
    }

    // Records "now" as the split timestamp and returns the milliseconds
    // elapsed since construction or the last reset().
    int split() {
        gettimeofday(&t2, 0);
        return elapsed();
    }
};

// xor shift
//
// Pseudo-random generator with four 32-bit words of state held in function
// statics, so the sequence is deterministic across runs and continues across
// calls. Not thread-safe (unsynchronized static state).
uint32_t xor128(void) {
    static uint32_t x = 123456789;
    static uint32_t y = 362436069;
    static uint32_t z = 521288629;
    static uint32_t w = 88675123;
    uint32_t t;
    t = x ^ (x << 11);
    x = y;
    y = z;
    z = w;
    return w = w ^ (w >> 19) ^ (t ^ (t >> 8));
}

// Reads one T from a possibly misaligned address.
//
// Dereferencing a misaligned T* is undefined behaviour; a fixed-size memcpy
// is well defined, and compilers typically lower it to a single plain load on
// targets that permit unaligned access.
template <class T>
inline T load_word(const unsigned char *p) {
    T value;
    std::memcpy(&value, p, sizeof(T));
    return value;
}

// Writes one T to a possibly misaligned address (see load_word).
template <class T>
inline void store_word(unsigned char *p, T value) {
    std::memcpy(p, &value, sizeof(T));
}

// Runs the benchmark for word type T at every byte offset in [0, sizeof(T))
// and prints the average time per round for each offset.
template <class T>
void runtest() {
    const std::size_t N = 10 * 1000 * 1000;
    const int repeat = 20;
    WallClockTimer timer;
    std::cout << " processing word of size " << sizeof(T) << std::endl;
    for (unsigned int offset = 0; offset < sizeof(T); ++offset) {
        // Two spare elements leave room for the N word slots after shifting
        // the start by up to sizeof(T) - 1 bytes.
        std::vector<T> bigarray(N + 2);
        std::cout << "offset = " << offset << std::endl;
        unsigned char *const storage =
            reinterpret_cast<unsigned char *>(&bigarray[0]);
        unsigned char *const begin = storage + offset;
        // The shifted window of N words must stay inside the allocation.
        // Checked once per offset, outside the timed region.
        assert(begin + N * sizeof(T) <= storage + bigarray.size() * sizeof(T));
        int sumt = 0;
        for (int k = 0; k < repeat; ++k) {
            timer.reset();
            // Random-index stores.
            for (std::size_t i = 0; i < N; ++i) {
                const std::size_t ri = xor128() % N;
                store_word<T>(begin + ri * sizeof(T), static_cast<T>(i));
            }
            // Random-index loads; `val` is volatile so the loads cannot be
            // optimized away.
            volatile T val = 1;
            for (std::size_t i = 0; i < N; ++i) {
                const std::size_t ri = xor128() % N;
                const T word = load_word<T>(begin + ri * sizeof(T));
                // The leading 1u forces unsigned arithmetic: for uint16_t the
                // operands would otherwise promote to int and the product
                // could overflow (undefined behaviour). The value stored is
                // unchanged modulo 2^(bits of T).
                val = static_cast<T>(val + (1u * word * val + 33u));
            }
            sumt += timer.split();
        }
        std::cout << " average time for offset " << offset << " is "
                  << sumt * 1.0 / repeat << std::endl;
    }
}

// Runs the benchmark for 2-, 4- and 8-byte words.
int main() {
    runtest<uint16_t>();
    std::cout << std::endl;
    runtest<uint32_t>();
    std::cout << std::endl;
    runtest<uint64_t>();
    std::cout << std::endl;
    return 0;
}
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers