I read in the Alpha reference manual (page (I) 4-145) that there is an
instruction, wh64, that avoids reloading cache lines from memory
during large writes. In theory this should be a good fit for copy_page or
clear_page. I gave it a try with clear_page, but wh64 instead greatly
decreases performance on a 21264.

-------------------------------------------------------------------
/*
 * Copyright (C) 2000 Andrea Arcangeli <[EMAIL PROTECTED]> SuSE
 *
 * Assembly clear_page for alpha (testing for wh64).
 */

#define __KERNEL__

#include <asm/page.h>
#include <asm/timex.h>
#include <asm/system.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_PAGES 3000

/*
 * Zero one page, issuing a wh64 hint before each 64-byte chunk.
 *
 * page: address of the page to clear, passed as an integer.
 *
 * Each loop iteration issues wh64 on the current chunk and then stores
 * the zero register ($31) to eight consecutive quadwords (64 bytes),
 * so PAGE_SIZE/64 iterations cover the whole page.  The counter
 * decrement is placed between the two groups of four stores.
 */
static inline void _clear_page(unsigned long page)
{
        unsigned long count = PAGE_SIZE/64;
        unsigned long *ptr = (unsigned long *)page;

        asm volatile("1:\n\t"

                     "wh64 (%1)\n\t"

                     "stq $31,0(%1)\n\t"
                     "stq $31,8(%1)\n\t"
                     "stq $31,16(%1)\n\t"
                     "stq $31,24(%1)\n\t"

                     "subq %0,1,%0\n\t"

                     "stq $31,32(%1)\n\t"
                     "stq $31,40(%1)\n\t"
                     "stq $31,48(%1)\n\t"
                     "stq $31,56(%1)\n\t"

                     "addq %1,64,%1\n\t"

                     "bne %0,1b" :
                     "=&r" (count), "=&r" (ptr) :
                     "0" (count), "1" (ptr) :
                     /*
                      * The asm writes the page through %1; without this
                      * clobber the compiler is free to assume memory is
                      * unchanged and reorder or cache accesses around it.
                      */
                     "memory");
}

/*
 * Zero one page with the same unrolled store loop as the wh64 variant,
 * but without the wh64 hint (the plain-assembly baseline).
 *
 * page: address of the page to clear, passed as an integer.
 *
 * Each loop iteration stores the zero register ($31) to eight
 * consecutive quadwords (64 bytes), so PAGE_SIZE/64 iterations cover
 * the whole page.
 */
static inline void _clear_page2(unsigned long page)
{
        unsigned long count = PAGE_SIZE/64;
        unsigned long *ptr = (unsigned long *)page;

        asm volatile("1:\n\t"
                     "stq $31,0(%1)\n\t"
                     "stq $31,8(%1)\n\t"
                     "stq $31,16(%1)\n\t"
                     "stq $31,24(%1)\n\t"

                     "subq %0,1,%0\n\t"

                     "stq $31,32(%1)\n\t"
                     "stq $31,40(%1)\n\t"
                     "stq $31,48(%1)\n\t"
                     "stq $31,56(%1)\n\t"

                     "addq %1,64,%1\n\t"

                     "bne %0,1b" :
                     "=&r" (count), "=&r" (ptr) :
                     "0" (count), "1" (ptr) :
                     /*
                      * The asm writes the page through %1; without this
                      * clobber the compiler is free to assume memory is
                      * unchanged and reorder or cache accesses around it.
                      */
                     "memory");
}

/*
 * Benchmark driver.
 *
 * Allocates NR_PAGES page-aligned pages, dirties them with memset, then
 * clears one third with clear_page() (from the included headers), one
 * third with _clear_page() (wh64) and one third with _clear_page2()
 * (no wh64), timing each pass with get_cycles().  Finally the second
 * and third regions are compared against the first to check that all
 * three routines produced the same contents.
 *
 * Returns 0 on completion, 1 if the buffer could not be allocated.
 */
int main(void)
{
        /*
         * One extra page of slack: rounding the start up to a page
         * boundary can skip up to PAGE_SIZE-1 bytes, and the aligned
         * region must still hold NR_PAGES full pages.
         */
        char *raw = malloc(PAGE_SIZE * (NR_PAGES + 1));
        char *first;
        char *page[NR_PAGES];
        cycles_t start, orig, wh64, other;
        int i;

        if (!raw) {
                fprintf(stderr, "out of memory\n");
                return 1;
        }

        /* Round up to the next page boundary. */
        first = (char *) (((unsigned long) raw + ~PAGE_MASK) & PAGE_MASK);

        /* Touch every page with a non-zero pattern before timing. */
        memset(first, 1, PAGE_SIZE*NR_PAGES);
        for (i = 0; i < NR_PAGES; i++)
                page[i] = first + PAGE_SIZE * i;

        start = get_cycles();
        for (i = 0; i < NR_PAGES/3; i++)
                clear_page((unsigned long)page[i]);
        orig = get_cycles();
        orig -= start;

        start = get_cycles();
        for (i = NR_PAGES/3; i < NR_PAGES/3*2; i++)
                _clear_page((unsigned long)page[i]);
        wh64 = get_cycles();
        wh64 -= start;

        start = get_cycles();
        for (i = NR_PAGES/3*2; i < NR_PAGES; i++)
                _clear_page2((unsigned long)page[i]);
        other = get_cycles();
        other -= start;

        /*
         * The width of cycles_t is not visible here, so cast explicitly
         * to match the format specifiers; the differences are computed
         * as signed values so a slower variant prints as negative.
         */
        printf("original %lu, wh64 %lu diff %ld, other %lu diff %ld\n",
               (unsigned long) orig, (unsigned long) wh64,
               (long) orig - (long) wh64,
               (unsigned long) other,
               (long) orig - (long) other);

        /*
         * Compare a full third of the pages.  Multiply after dividing
         * NR_PAGES (not PAGE_SIZE) so no bytes are lost to truncation.
         */
        if (memcmp(page[0], page[NR_PAGES/3], PAGE_SIZE * (NR_PAGES/3)))
                printf("error wh64\n");
        if (memcmp(page[0], page[NR_PAGES/3*2], PAGE_SIZE * (NR_PAGES/3)))
                printf("error other\n");

        free(raw);
        return 0;
}
-------------------------------------------------------------------

The output I get from my bench is:

original 7631627, wh64 9230262 diff -1598635, other 7579267 diff 52360
                                     ^^^^^^^ it's much slower than the
                                             original clear_page in C

Comments/hints?

Andrea

PS. The above program compiles only with a GAS that has the binutils fix
    I posted a few hours ago applied.

Reply via email to