Would someone please try this? I don't have an x64 machine right now :(

/*
 * Write the decimal representation of magnitude backwards, ending just
 * before p.
 *
 * p must point one past the last writable byte of a buffer with room for
 * every digit (at most 10 for a uint32_t). No terminator is written.
 * Returns a pointer to the first (most significant) digit.
 */
static char *conv_3(char *p, uint32_t magnitude)
{
   do {
       /* A cast binds tighter than '+', so parenthesize the sum to make the
        * narrowing to char explicit instead of casting only the '0'. */
       *--p = (char) ('0' + magnitude % 10);
   } while ((magnitude /= 10) != 0);

   return p;
}

on x86-32bit I got this:

conv_1
cycles:           64
cycles:           47
cycles:           47
cycles:           18
cycles:           18
cycles:           18
cycles:           18
cycles:           18
conv_2
cycles:           62
cycles:           77
cycles:           62
cycles:           18
cycles:           18
cycles:           18
cycles:           18
cycles:           18
conv_3
cycles:           62
cycles:           40
cycles:           40
cycles:           18
cycles:           18
cycles:           18
cycles:           18
cycles:           18

btw, I get an order of magnitude lower numbers than you on my machine,
am I doing something wrong?
gcc version 4.1.2
/proc/cpuinfo:
model name   : AMD Athlon(tm) 64 Processor 2800+
cpu MHz        : 1800.000
cache size      : 512 KB
flags              : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr
pge mca cmov pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext lm
3dnowext 3dnow up ts fid vid ttp
bogomips       : 3609.09
clflush size     : 64



--
Lucian Adrian Grijincu

On 4/27/07, Davi Arnaut <[EMAIL PROTECTED]> wrote:
On 27/04/2007, at 00:14, Lucian Adrian Grijincu wrote:

> in apr-conv10-faster.patch you added:
>
> static const char digits[] = "0123456789";
> *--p = digits[magnitude % 10];
>
> Why is this faster than:
> *--p = (char) '0' + (magnitude % 10); ?

You have to take into account the entire loop. The following:

do {
        u_widest_int new_magnitude = magnitude / 10;
        *--p = (char) (magnitude - new_magnitude * 10 + '0');
        magnitude = new_magnitude;
} while (magnitude);

against:

do {
        *--p = digits[magnitude % 10];
} while ((magnitude /= 10) != 0);

digits is easily cacheable, fewer assignments.

>
> For your "faster" version, under the hood, the C compiler adds
> (magnitude % 10) to the address of digits and then copies the contents
> of the memory location represented by the sum's result into *--p.
>
> My version just adds (magnitude % 10) to '0' and stores the result
> in *--p.

Talk is cheap, let's benchmark! To see the generated assembly:

gcc -O2 -o bench bench.c -g
objdump -S bench > bench-asm

# Intel(R) Celeron(R) CPU 2.20GHz
[EMAIL PROTECTED] ~]$ gcc -o bench bench.c -O2 # uint32_t
[EMAIL PROTECTED] ~]$ ./bench $RANDOM
conv_1
cycles:          236
cycles:          236
cycles:          236
cycles:          236
cycles:          236
cycles:          236
cycles:          236
cycles:          236
conv_2
cycles:          236
cycles:          220
cycles:          224
cycles:          220
cycles:          224
cycles:          224
cycles:          224
cycles:          224
conv_1
cycles:          236
cycles:          236
cycles:          236
cycles:          236
cycles:          236
cycles:          236
cycles:          236
cycles:          236
conv_2
cycles:          220
cycles:          224
cycles:          224
cycles:          224
cycles:          224
cycles:          224
cycles:          224
cycles:          224

[EMAIL PROTECTED] ~]$ gcc -o bench bench.c -O2 # uint64_t
[EMAIL PROTECTED] ~]$ ./bench $RANDOM |more
conv_1
cycles:          508
cycles:          532
cycles:          540
cycles:          468
cycles:          468
cycles:          468
cycles:          468
conv_2
cycles:         1188
cycles:          824
cycles:          896
cycles:          828
cycles:          824
cycles:          824
cycles:          820
conv_1
cycles:          524
cycles:          492
cycles:          468
cycles:          504
cycles:          468
cycles:          504
cycles:          468
conv_2
cycles:          768
cycles:          836
cycles:          836
cycles:          820
cycles:          820
cycles:          820
cycles:          820


> Am I missing something here?

Both versions, after compiler optimizations, yield similar results, but
the patch hurts the uint64_t (apr_uint64_t) case quite a bit. "Faster" was
an overstatement; I withdraw apr-conv10-faster.patch.

--
Davi Arnaut



#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Read the CPU time-stamp counter into val (a 64-bit unsigned lvalue).
 *
 * RDTSC returns the counter split across EDX:EAX. The "=A" constraint names
 * that register pair only on 32-bit x86; on x86-64 it means "RAX or RDX", so
 * a 64-bit val would receive just one half of the counter. Capturing both
 * halves explicitly and combining them is correct on both targets.
 */
#define rdtscll(val) do { \
    uint32_t rdtsc_lo_, rdtsc_hi_; \
    __asm__ __volatile__("rdtsc" : "=a" (rdtsc_lo_), "=d" (rdtsc_hi_)); \
    (val) = ((unsigned long long) rdtsc_hi_ << 32) | rdtsc_lo_; \
} while (0)

/* ASCII characters for the decimal digits, indexed by digit value (0-9);
 * conv_2 looks up each output character here. */
static const char digits[] = "0123456789";

/*
 * Emit the decimal digits of magnitude in reverse, filling the bytes that
 * precede p. Each digit is derived from the quotient with a multiply and a
 * subtract rather than a separate modulo.
 *
 * Returns the address of the most significant digit. Nothing is written at
 * or after the original p, and no terminator is added.
 */
static char *conv_1(char *p, uint32_t magnitude)
{
    uint32_t quotient;

    do {
        quotient = magnitude / 10;
        p--;
        *p = (char) (magnitude - quotient * 10 + '0');
        magnitude = quotient;
    } while (magnitude != 0);

    return p;
}

/*
 * Emit the decimal digits of magnitude in reverse, filling the bytes that
 * precede p. Each output character is fetched from the digits table.
 *
 * Returns the address of the most significant digit; no terminator is added.
 */
static char *conv_2(char *p, uint32_t magnitude)
{
    for (;;) {
        p--;
        *p = digits[magnitude % 10];

        magnitude /= 10;
        if (magnitude == 0) {
            break;
        }
    }

    return p;
}

/*
 * Write the decimal representation of magnitude backwards, ending just
 * before p.
 *
 * p must point one past the last writable byte of a buffer with room for
 * every digit (at most 10 for a uint32_t). No terminator is written.
 * Returns a pointer to the first (most significant) digit.
 */
static char *conv_3(char *p, uint32_t magnitude)
{
    do {
        /* A cast binds tighter than '+', so parenthesize the sum to make the
         * narrowing to char explicit instead of casting only the '0'. */
        *--p = (char) ('0' + magnitude % 10);
    } while ((magnitude /= 10) != 0);

    return p;
}

/*
 * Benchmark conv_1: print a "conv_1" header, then for each of iter runs
 * print the time-stamp-counter difference measured around one conversion
 * of num.
 */
static void bench_1(unsigned int iter, uint32_t num)
{
    char buf[512], *p = buf + sizeof buf;
    unsigned long long ts, te;

    puts("conv_1");

    while (iter--) {
        rdtscll(ts);
        /* conv_1 stores with *--p, so it must be handed the END of the
         * buffer; passing buf itself made it write below buf[0]. */
        conv_1(p, num);
        rdtscll(te);
        printf("cycles: %12llu\n", te - ts);
    }
}

/*
 * Benchmark conv_2: print a "conv_2" header, then for each of iter runs
 * print the time-stamp-counter difference measured around one conversion
 * of num into the tail of a local scratch buffer.
 */
static void bench_2(unsigned int iter, uint32_t num)
{
    char buf[512];
    char *p = buf + 512;
    unsigned long long started;
    unsigned long long finished;

    puts("conv_2");

    for (; iter != 0; iter--) {
        rdtscll(started);
        conv_2(p, num);
        rdtscll(finished);

        printf("cycles: %12llu\n", finished - started);
    }
}

/*
 * Benchmark conv_3: print a "conv_3" header, then for each of iter runs
 * print the time-stamp-counter difference measured around one conversion
 * of num into the tail of a local scratch buffer.
 */
static void bench_3(unsigned int iter, uint32_t num)
{
    char buf[512];
    char *p = buf + 512;
    unsigned long long started;
    unsigned long long finished;

    puts("conv_3");

    for (; iter != 0; iter--) {
        rdtscll(started);
        conv_3(p, num);
        rdtscll(finished);

        printf("cycles: %12llu\n", finished - started);
    }
}

/*
 * Entry point: parse the number to convert from argv[1], then run bench_1,
 * bench_2 and bench_3 in that order, twice, with 8 iterations each.
 *
 * Returns EXIT_FAILURE (after printing a message to stderr) when the
 * argument is missing, is not a plain decimal number, or does not fit in a
 * uint32_t; returns 0 otherwise.
 */
int main(int argc, char *argv[])
{
    char *end;
    unsigned long long parsed;
    uint32_t num;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <number>\n", argc > 0 ? argv[0] : "bench");
        return EXIT_FAILURE;
    }

    /* strtoull reports where parsing stopped, which atoi cannot; a negative
     * or overflowing input comes back larger than UINT32_MAX and is
     * rejected by the range check. */
    parsed = strtoull(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || parsed > UINT32_MAX) {
        fprintf(stderr, "invalid number: %s\n", argv[1]);
        return EXIT_FAILURE;
    }
    num = (uint32_t) parsed;

    bench_1(8, num);
    bench_2(8, num);
    bench_3(8, num);
    bench_1(8, num);
    bench_2(8, num);
    bench_3(8, num);

    return 0;
}

Reply via email to