On Wed, 19 Feb 2003, Abramo Bagnara wrote: > The results are amazing and I'd say Jaroslav has made some mistakes in > his handmade asm.
I don't think so. It seems that my brain still remembers assembler ;-) You passed the wrong values to my code, so it did unaligned accesses. Fixes to make things the same: --- sum.c 2003-02-19 18:55:20.000000000 +0100 +++ a.c 2003-02-19 19:31:00.000000000 +0100 @@ -11,6 +11,8 @@ typedef short int s16; typedef int s32; +#define CONFIG_SMP + #ifdef CONFIG_SMP #define LOCK_PREFIX "lock ; " #else @@ -54,7 +56,7 @@ static inline void atomic_add(volatile int *dst, int v) { __asm__ __volatile__( - LOCK_PREFIX "addl %0,%1" + LOCK_PREFIX "addl %1,%0" :"=m" (*dst) :"ir" (v)); } @@ -62,7 +64,9 @@ void mix_areas0(unsigned int size, volatile s16 *dst, s16 *src, volatile s32 *sum, - unsigned int dst_step, unsigned int src_step) + unsigned int dst_step, + unsigned int src_step, + unsigned int sum_step) { while (size-- > 0) { s32 sample = *dst + *src; @@ -70,8 +74,8 @@ *dst = sample > 0 ? 0x7fff : -0x8000; else *dst = sample; - dst += dst_step; - src += src_step; + ((char *)dst) += dst_step; + ((char *)src) += src_step; } } @@ -194,7 +198,9 @@ void mix_areas2(unsigned int size, volatile s16 *dst, s16 *src, volatile s32 *sum, - unsigned int dst_step, unsigned int src_step) + unsigned int dst_step, + unsigned int src_step, + unsigned int sum_step) { while (size-- > 0) { s32 sample = *src; @@ -204,15 +210,15 @@ do { sample = *sum; s16 s; - if (unlikely(sample & 0xffff0000)) + if (unlikely(sample & 0x7fff0000)) s = sample > 0 ? 
0x7fff : -0x8000; else s = sample; *dst = s; } while (unlikely(sample != *sum)); - sum++; - dst += dst_step; - src += src_step; + ((char *)sum) += sum_step; + ((char *)dst) += dst_step; + ((char *)src) += src_step; } } @@ -236,19 +242,19 @@ } rdtscll(begin); for (i = 0; i < n; i++) { - mix_areas0(size, dst, srcs[i], sum, 1, 1); + mix_areas0(size, dst, srcs[i], sum, 2, 2, 4); } rdtscll(end); printf("mix_areas0: %lld\n", end - begin); rdtscll(begin); for (i = 0; i < n; i++) { - mix_areas1(size, dst, srcs[i], sum, 1, 1, 1); + mix_areas1(size, dst, srcs[i], sum, 2, 2, 4); } rdtscll(end); printf("mix_areas1: %lld\n", end - begin); rdtscll(begin); for (i = 0; i < n; i++) { - mix_areas2(size, dst, srcs[i], sum, 1, 1); + mix_areas2(size, dst, srcs[i], sum, 2, 2, 4); } rdtscll(end); printf("mix_areas2: %lld\n", end - begin); perex@pnote:~> cat /proc/cpuinfo processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 8 model name : Pentium III (Coppermine) stepping : 6 cpu MHz : 847.473 cache size : 256 KB fdiv_bug : no hlt_bug : no f00f_bug : no coma_bug : no fpu : yes fpu_exception : yes cpuid level : 2 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 sep mtrr pge mca cmov pat pse36 mmx fxsr sse bogomips : 1679.36 perex@pnote:~> ./a.out 2048 4 32267 mix_areas0: 170691 mix_areas1: 675795 mix_areas2: 708995 Have fun, Jaroslav ----- Jaroslav Kysela <[EMAIL PROTECTED]> Linux Kernel Sound Maintainer ALSA Project, SuSE Labs ------------------------------------------------------- This SF.net email is sponsored by: SlickEdit Inc. Develop an edge. The most comprehensive and flexible code editor you can use. Code faster. C/C++, C#, Java, HTML, XML, many more. FREE 30-Day Trial. www.slickedit.com/sourceforge _______________________________________________ Alsa-devel mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/alsa-devel