I found an interesting paper on improving index speed by prefetching memory
data into the L1/L2 cache (there was a discussion about prefetching disk
data into memory several days ago in the "ice-breaker" thread):
http://www.cs.cmu.edu/~chensm/papers/index_pf_final.pdf

A related technique is also used to speed up memcpy:
http://people.redhat.com/arjanv/pIII.c

I wonder if we could use it to speed up in-memory scan operations for heap
or index. Tom's patch has made scans handle a page (vs. a row) at a
time, which is a basis for this optimization.

I wrote a program to try to simulate it, but I am not good at
micro-optimization, and I only get a very weak but kind-of-stable improvement.
I wonder if anyone here is interested in taking a look.

Regards,
Qingqing

----------------------------------

Test results
--------------
Cache line size: 64
CPU: P4 2.4G
$#./prefetch 10 16
Sum: -951304192: with prefetch on - duration: 42.163 ms
Sum: -951304192: with prefetch off - duration: 42.838 ms
Sum: -951304192: with prefetch on - duration: 44.044 ms
Sum: -951304192: with prefetch off - duration: 42.792 ms
Sum: -951304192: with prefetch on - duration: 42.324 ms
Sum: -951304192: with prefetch off - duration: 42.803 ms
Sum: -951304192: with prefetch on - duration: 42.189 ms
Sum: -951304192: with prefetch off - duration: 42.801 ms
Sum: -951304192: with prefetch on - duration: 42.155 ms
Sum: -951304192: with prefetch off - duration: 42.827 ms
Sum: -951304192: with prefetch on - duration: 42.179 ms
Sum: -951304192: with prefetch off - duration: 42.798 ms
Sum: -951304192: with prefetch on - duration: 42.180 ms
Sum: -951304192: with prefetch off - duration: 42.804 ms
Sum: -951304192: with prefetch on - duration: 42.193 ms
Sum: -951304192: with prefetch off - duration: 42.827 ms
Sum: -951304192: with prefetch on - duration: 42.164 ms
Sum: -951304192: with prefetch off - duration: 42.810 ms
Sum: -951304192: with prefetch on - duration: 42.182 ms
Sum: -951304192: with prefetch off - duration: 42.826 ms

Test program
----------------

/*
 * prefetch.c
 *              PostgreSQL warm-cache sequential scan simulator with prefetch
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <errno.h>
#include <sys/time.h>

/* Minimal boolean type; this file defines its own rather than using <stdbool.h>. */
typedef char bool;
#define true    ((bool) 1)
#define false   ((bool) 0)

/* Size in bytes of one simulated block; the scan walks the buffer block by block. */
#define BLCKSZ  8192
/* Stride in bytes of the inner scan loop (one step per 64-byte cache line). */
#define CACHESZ 64
/* Number of blocks allocated and scanned in each timed pass. */
#define NBLCKS  5000

/*
 * Running total accumulated by the scan loop; reset before each timed pass
 * and printed afterwards.
 * NOTE(review): presumably file-scope so the compiler cannot discard the
 * simulated per-tuple work -- confirm.
 */
int     sum;

/*
 * usage
 *              Print the command-line synopsis to stderr and terminate with the
 *              same exit status (-1) the program uses for every argument error.
 */
static void
usage(void)
{
        fprintf(stderr, "usage: prefetch <rounds> <cpu_cost [1, 16]>\n");
        exit(-1);
}

/*
 * scan_blocks
 *              Walk the buffer one block at a time and, inside each block, one
 *              CACHESZ-byte step at a time, adding the squares of the first
 *              cpu_cost ints of every step to the global "sum".
 *
 * blocks       buffer of NBLCKS * BLCKSZ bytes
 * enable       when true, issue a prefetchnta for the address 128 bytes
 *              (two CACHESZ steps) ahead of the step being processed
 * cpu_cost     number of ints read per step; the caller guarantees 1..16, so
 *              the reads never leave the current 64-byte step
 *
 * Near the end of the buffer the prefetch address lies past the allocation;
 * the instruction is only a hint, and nothing is loaded or stored through
 * that address.
 */
static void
scan_blocks(char *blocks, bool enable, int cpu_cost)
{
        int     j, k;

        for (j = 0; j < NBLCKS; j++)
        {
                char    *blck = blocks + j*BLCKSZ;

                for (k = 0; k < BLCKSZ; k += CACHESZ)
                {
                        int     *s = (int *)(blck + k);
                        int     u = cpu_cost;

                        if (enable)
                        {
                                /* prefetch ahead */
                                __asm__ __volatile__ (
                                "1: prefetchnta 128(%0)\n"
                                        : : "r" (s) : "memory" );
                        }

                        /*
                         * Pretend to process the current tuple.  The square
                         * is expected to wrap, so the arithmetic is done in
                         * unsigned int, where wrap-around is well defined,
                         * instead of overflowing a signed int.
                         */
                        while (u--)
                        {
                                unsigned int    v = (unsigned int) s[u];

                                sum = (int) ((unsigned int) sum + v * v);
                        }
                }
        }
}

/*
 * run_pass
 *              Allocate and fill a fresh buffer, time one full scan of it with
 *              prefetching on or off, and print the resulting sum and the
 *              elapsed wall-clock time.  Allocation and fill happen before the
 *              timer starts and are not part of the measurement.
 *
 * Exits with status -1 if the buffer cannot be allocated.
 */
static void
run_pass(bool enable, int cpu_cost)
{
        struct  timeval start_t, stop_t;
        char    *blocks;

        blocks = malloc(BLCKSZ*NBLCKS);
        if (blocks == NULL)
        {
                fprintf(stderr, "prefetch: out of memory\n");
                exit(-1);
        }
        memset(blocks, 'a', BLCKSZ*NBLCKS);

        sum = 0;
        gettimeofday(&start_t, NULL);
        scan_blocks(blocks, enable, cpu_cost);
        gettimeofday(&stop_t, NULL);

        free(blocks);

        /* borrow one second so the microsecond difference is non-negative */
        if (stop_t.tv_usec < start_t.tv_usec)
        {
                stop_t.tv_sec--;
                stop_t.tv_usec += 1000000;
        }

        /* whole milliseconds, then the leftover microseconds as the fraction */
        fprintf(stdout, "Sum: %d: with prefetch %s - duration: %ld.%03ld ms\n",
                        sum,
                        enable?"on":"off",
                        (long) ((stop_t.tv_sec - start_t.tv_sec) * 1000 +
                                        (stop_t.tv_usec - start_t.tv_usec) / 1000),
                        (long) (stop_t.tv_usec - start_t.tv_usec) % 1000);
}

/*
 * main
 *              prefetch <rounds> <cpu_cost>
 *
 * Runs <rounds> rounds; each round is one timed pass with prefetching on
 * followed by one with prefetching off, so the output alternates on/off.
 * <cpu_cost> must be in 1..16 and <rounds> must be at least 1; any other
 * argument list prints the usage message and exits with status -1.
 */
int
main(int argc, char *argv[])
{
        int     i, rounds;
        int     cpu_cost;

        if (argc != 3)
                usage();

        rounds = atoi(argv[1]);
        cpu_cost  = atoi(argv[2]);

        /*
         * A negative cpu_cost would make the per-step loop index backwards
         * out of the buffer, and more than 16 ints would read past the
         * current 64-byte step.
         */
        if (rounds < 1 || cpu_cost < 1 || cpu_cost > 16)
                usage();

        /* two passes per round instead of looping to 2*rounds, which could overflow */
        for (i = 0; i < rounds; i++)
        {
                run_pass(true, cpu_cost);
                run_pass(false, cpu_cost);
        }

        exit(0);
}

---------------------------(end of broadcast)---------------------------
TIP 1: if posting/reading through Usenet, please send an appropriate
       subscribe-nomail command to [EMAIL PROTECTED] so that your
       message can get through to the mailing list cleanly

Reply via email to