Here you have the second round.

I think I have enough data to state the following:

1) the current hand-made asm has some serious bugs affecting both
correctness and efficiency
2) the naive approach has a non-marginal failure rate
3) the server-less approach is much less efficient than the server-based
approach

Take these results into consideration:
$ ./sum 2048 8 16384
CPU clock: 1460475899.426625
mix_areas0: 87191 0.032139%
mix_areas1: 145666 0.053692% (365)
mix_areas2: 3034611 1.118555% (1217)
mix_areas3: 327412 0.120684% (0)

The server-based approach needs about 0.03% of CPU power to mix one
stereo s16 stream at 44100 Hz.
The fastest server-less approach I have been able to come up with so far
needs 0.12%.

It's not a big fraction, but I think that having a lot of machine power
available is never an excuse to waste it.

The naive approach is much better, but with 8 random mixed streams it has
a nearly 18% probability of giving wrong results (with an average power
for each stream of 25%).

I'd suggest using a dmix-like approach for pcm_share, pcm_snoop and for
the sum part of pcm_mix, but using a separate thread for saturation,
transfer to hardware and silencing.

I hope this will be useful for ALSA to take the right path.

P.S. I had almost forgotten how enjoyable it is to work on ALSA ;-)

-- 
Abramo Bagnara                       mailto:[EMAIL PROTECTED]

Opera Unica                          Phone: +39.546.656023
Via Emilia Interna, 140
48014 Castel Bolognese (RA) - Italy
#include <stdlib.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>

/*
 * rdtscll(val) - read the CPU time-stamp counter into the 64-bit lvalue val.
 *
 * RDTSC returns the counter split across EDX (high half) and EAX (low half).
 * The "=A" constraint only names the EDX:EAX pair on 32-bit x86; on x86-64 it
 * selects a single register, so half of the counter would be lost.  Reading
 * the two halves through explicit "=a"/"=d" outputs is correct on both.
 */
#define rdtscll(val) \
     do { \
          unsigned int rdtsc_lo_, rdtsc_hi_; \
          __asm__ __volatile__("rdtsc" : "=a" (rdtsc_lo_), "=d" (rdtsc_hi_)); \
          (val) = ((unsigned long long)rdtsc_hi_ << 32) | rdtsc_lo_; \
     } while (0)

/*
 * Branch hints built on GCC's __builtin_expect: likely(x) marks x as expected
 * to be non-zero (1), unlikely(x) as expected to be zero.  Both evaluate to
 * the value of x itself.
 */
#define likely(x)       __builtin_expect((x),1)
#define unlikely(x)     __builtin_expect((x),0)

/*
 * Sample types used by the mixers below: s16 for source/destination samples,
 * s32 for the accumulation ("sum") buffer.
 * NOTE(review): assumes short is 16 bits and int is 32 bits on the target ABI.
 */
typedef short int s16;
typedef int s32;

/*
 * LOCK_PREFIX - prefix prepended to the read-modify-write instructions below.
 *
 * The "lock" prefix is only emitted when CONFIG_SMP is defined at build time;
 * otherwise the instructions are assembled without it.
 */
#ifdef CONFIG_SMP
#define LOCK_PREFIX "lock ; "
#else
#define LOCK_PREFIX ""
#endif

/*
 * __xg() casts an arbitrary pointer to a large dummy structure type; it is
 * used only to build the "m" (memory) operand handed to the inline asm.
 */
struct __xchg_dummy { unsigned long a[100]; };
#define __xg(x) ((struct __xchg_dummy *)(x))

/*
 * __cmpxchg - compare-and-exchange on a 1, 2 or 4 byte object (and 8 bytes
 * on x86-64).
 * @ptr:  address of the object
 * @old:  value the object is expected to hold
 * @new:  value stored if the object holds @old
 * @size: size of the object in bytes
 *
 * Returns the value read from *ptr by the instruction: it equals @old when the
 * exchange happened.  For any other @size nothing is touched and @old is
 * returned, which the caller cannot tell apart from a successful exchange.
 */
static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
                                      unsigned long new, int size)
{
        unsigned long prev;
        switch (size) {
        case 1:
                __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
                                     : "=a"(prev)
                                     : "q"(new), "m"(*__xg(ptr)), "0"(old)
                                     : "memory");
                return prev;
        case 2:
                __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
                                     : "=a"(prev)
                                     : "q"(new), "m"(*__xg(ptr)), "0"(old)
                                     : "memory");
                return prev;
        case 4:
                /*
                 * %k1 prints the 32-bit name of the register.  A plain %1
                 * would print a 64-bit register where unsigned long is 64
                 * bits wide, which does not assemble with cmpxchgl.
                 */
                __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
                                     : "=a"(prev)
                                     : "q"(new), "m"(*__xg(ptr)), "0"(old)
                                     : "memory");
                return prev;
#ifdef __x86_64__
        case 8:
                __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
                                     : "=a"(prev)
                                     : "q"(new), "m"(*__xg(ptr)), "0"(old)
                                     : "memory");
                return prev;
#endif
        }
        return old;
}

/*
 * cmpxchg(ptr, o, n) - type-generic front end: picks the operand size from
 * sizeof(*ptr) and casts the previous value back to the pointed-to type.
 */
#define cmpxchg(ptr,o,n)\
        ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
                                        (unsigned long)(n),sizeof(*(ptr))))

/*
 * atomic_add - add v to the integer at *dst with a single addl instruction.
 * @dst: location to update in place
 * @v:   addend (immediate or register operand)
 *
 * The instruction is prefixed with LOCK_PREFIX, whatever that expands to in
 * this build.
 */
static inline void atomic_add(volatile int *dst, int v)
{
        /* "+m": *dst is both read and written by the instruction. */
        __asm__ __volatile__(
                LOCK_PREFIX "addl %1,%0"
                : "+m" (*dst)
                : "ir" (v));
}


/*
 * detect_cpu_clock - estimate the time-stamp-counter rate in ticks per second.
 *
 * Samples the TSC and the wall clock, sleeps for about one second, samples
 * both again and returns elapsed ticks divided by elapsed seconds.
 */
static double
detect_cpu_clock(void)
{
        struct timeval tm_begin, tm_end;
        unsigned long long tsc_begin, tsc_end;
        double elapsed;

        /* Warm cache */
        gettimeofday(&tm_begin, NULL);

        rdtscll(tsc_begin);
        gettimeofday(&tm_begin, NULL);

        /*
         * POSIX only specifies usleep() for arguments below 1000000 and
         * allows EINVAL otherwise, so a full second is slept with sleep().
         */
        sleep(1);

        rdtscll(tsc_end);
        gettimeofday(&tm_end, NULL);

        elapsed = (tm_end.tv_sec - tm_begin.tv_sec)
                + (tm_end.tv_usec - tm_begin.tv_usec) / 1e6;
        return (tsc_end - tsc_begin) / elapsed;
}

/*
 * mix_areas0 - accumulate one 16-bit stream into the 32-bit sum buffer.
 * @size:     number of samples to add
 * @src:      source samples; advanced by @src_step elements per sample
 * @sum:      accumulator; advanced by one element per sample
 * @src_step: source stride, in s16 elements
 *
 * Each sample is added with atomic_add(); no saturation is done here.
 */
void mix_areas0(unsigned int size,
                const s16 *src,
                volatile s32 *sum,
                unsigned int src_step)
{
        unsigned int remaining;

        for (remaining = size; remaining != 0; remaining--) {
                atomic_add(sum++, *src);
                src += src_step;
        }
}

/*
 * saturate - convert 32-bit sums to 16-bit samples, clipping out-of-range
 * values.
 * @size:     number of samples to convert
 * @dst:      output samples; advanced by @dst_step elements per sample
 * @sum:      input sums, read consecutively
 * @dst_step: output stride, in s16 elements
 *
 * Values below -0x8000 become -0x8000, values above 0x7fff become 0x7fff.
 */
void saturate(unsigned int size,
              s16 *dst, const s32 *sum,
              unsigned int dst_step)
{
        unsigned int i;

        for (i = 0; i < size; i++) {
                s32 clipped = sum[i];

                if (unlikely(clipped < -0x8000))
                        clipped = -0x8000;
                else if (unlikely(clipped > 0x7fff))
                        clipped = 0x7fff;
                *dst = clipped;
                dst += dst_step;
        }
}

/*
 * mix_areas1 - "naive" mixer: add a stream straight into the 16-bit output,
 * clipping after every addition.
 * @size:     number of samples to mix
 * @dst:      output samples, read and rewritten in place
 * @src:      source samples
 * @dst_step: output stride, in s16 elements
 * @src_step: source stride, in s16 elements
 *
 * Because the clipped value is what the next stream is added to, the result
 * can differ from clipping the full 32-bit sum once at the end.
 */
void mix_areas1(unsigned int size,
                volatile s16 *dst, const s16 *src,
                unsigned int dst_step, unsigned int src_step)
{
        unsigned int remaining;

        for (remaining = size; remaining != 0; remaining--) {
                /* One read of *dst and one write per sample. */
                s32 mixed = *dst + *src;

                if (unlikely(mixed < -0x8000))
                        mixed = -0x8000;
                else if (unlikely(mixed > 0x7fff))
                        mixed = 0x7fff;
                *dst = mixed;

                dst += dst_step;
                src += src_step;
        }
}

/*
 * mix_areas2 - hand-written x86 assembly mixer.
 * @size:     number of samples to mix
 * @dst:      16-bit output samples
 * @src:      16-bit source samples
 * @sum:      32-bit accumulation buffer
 * @dst_step: amount added to the dst address after each sample
 * @src_step: amount added to the src address after each sample
 * @sum_step: amount added to the sum address after each sample
 *
 * For every sample the source value is added to *sum (after first subtracting
 * the old *sum if *dst was found to be 0), then the saturated sum is stored
 * to *dst and the store is repeated until *sum is seen unchanged.
 *
 * The three steps are added to the address registers as raw values, i.e.
 * they are byte strides, not element counts: callers must pass e.g.
 * sizeof(s16) for dst/src and sizeof(s32) for sum to walk packed buffers.
 *
 * The "lock" prefixes are spelled out in the asm text and do not depend on
 * LOCK_PREFIX.
 *
 * NOTE(review): pointers are loaded with movl into 32-bit registers, so this
 * looks usable only where pointers are 32 bits wide (i386) - confirm before
 * building for another target.
 */
void mix_areas2(unsigned int size,
                volatile s16 *dst, const s16 *src,
                volatile s32 *sum, unsigned int dst_step,
                unsigned int src_step, unsigned int sum_step)
{
        /*
         *  ESI - src
         *  EDI - dst
         *  EBX - sum
         *  ECX - source sample, then snapshot of *sum ("old sample")
         *  EAX - cmpxchg comparand / saturated constant
         *  EDX - size (remaining samples)
         */
        __asm__ __volatile__ (
                "\n"

                /*
                 *  initialization, load EDX, ESI, EDI, EBX registers
                 */
                "\tmovl %0, %%edx\n"
                "\tmovl %1, %%edi\n"
                "\tmovl %2, %%esi\n"
                "\tmovl %3, %%ebx\n"

                /*
                 * while (size-- > 0) {
                 * (nothing to do when size == 0)
                 */
                "\tcmp $0, %%edx\n"
                "jz 6f\n"

                "1:"

                /*
                 *   sample = *src;
                 *   if (cmpxchg(dst, 0, 1) == 0)
                 *     sample -= *sum;
                 *   atomic add of sample to *sum;
                 *
                 * movswl does not modify the flags, so the jnz below still
                 * tests the ZF result of cmpxchgw.
                 */
                "\tmovw $0, %%ax\n"
                "\tmovw $1, %%cx\n"
                "\tlock; cmpxchgw %%cx, (%%edi)\n"
                "\tmovswl (%%esi), %%ecx\n"
                "\tjnz 2f\n"
                "\tsubl (%%ebx), %%ecx\n"
                "2:"
                "\tlock; addl %%ecx, (%%ebx)\n"

                /*
                 *   do {
                 *     sample = old_sample = *sum;
                 *     saturate(sample);
                 *     *dst = sample;
                 *   } while (old_sample != *sum);
                 */

                "3:"
                "\tmovl (%%ebx), %%ecx\n"
                "\tcmpl $0x7fff,%%ecx\n"
                "\tjg 4f\n"
                "\tcmpl $-0x8000,%%ecx\n"
                "\tjl 5f\n"
                "\tmovw %%cx, (%%edi)\n"
                "\tcmpl %%ecx, (%%ebx)\n"
                "\tjnz 3b\n"

                /*
                 * while (size-- > 0)
                 * advance the three addresses by their byte steps
                 */
                "\tadd %4, %%edi\n"
                "\tadd %5, %%esi\n"
                "\tadd %6, %%ebx\n"
                "\tdecl %%edx\n"
                "\tjnz 1b\n"
                "\tjmp 6f\n"

                /*
                 *  sample > 0x7fff: store 0x7fff, recheck *sum, advance
                 */

                "4:"
                "\tmovw $0x7fff, %%ax\n"
                "\tmovw %%ax, (%%edi)\n"
                "\tcmpl %%ecx,(%%ebx)\n"
                "\tjnz 3b\n"
                "\tadd %4, %%edi\n"
                "\tadd %5, %%esi\n"
                "\tadd %6, %%ebx\n"
                "\tdecl %%edx\n"
                "\tjnz 1b\n"
                "\tjmp 6f\n"

                /*
                 *  sample < -0x8000: store -0x8000, recheck *sum, advance
                 */

                "5:"
                "\tmovw $-0x8000, %%ax\n"
                "\tmovw %%ax, (%%edi)\n"
                "\tcmpl %%ecx, (%%ebx)\n"
                "\tjnz 3b\n"
                "\tadd %4, %%edi\n"
                "\tadd %5, %%esi\n"
                "\tadd %6, %%ebx\n"
                "\tdecl %%edx\n"
                "\tjnz 1b\n"
                // "\tjmp 6f\n"
                
                "6:"

                : /* no output regs */
                : "m" (size), "m" (dst), "m" (src), "m" (sum), "m" (dst_step), "m" 
(src_step), "m" (sum_step)
                : "esi", "edi", "edx", "ecx", "ebx", "eax"
        );
}


/*
 * mix_areas3 - C version of the server-less mixer.
 * @size:     number of samples to mix
 * @dst:      16-bit output samples
 * @src:      16-bit source samples
 * @sum:      32-bit accumulation buffer, advanced by one element per sample
 * @dst_step: output stride, in s16 elements
 * @src_step: source stride, in s16 elements
 *
 * Per sample: if *dst was 0 (and has just been swapped to 1), the old *sum is
 * subtracted from the contribution before it is added to *sum.  The saturated
 * sum is then stored to *dst, repeating for as long as *sum is seen to differ
 * from the value that was just stored.
 */
void mix_areas3(unsigned int size,
                volatile s16 *dst, const s16 *src,
                volatile s32 *sum,
                unsigned int dst_step, unsigned int src_step)
{
        for (; size != 0; size--, sum++, dst += dst_step, src += src_step) {
                s32 contribution = *src;
                s32 total;

                if (cmpxchg(dst, 0, 1) == 0)
                        contribution -= *sum;
                atomic_add(sum, contribution);

                do {
                        s32 clipped;

                        total = *sum;
                        clipped = total;
                        if (unlikely(total < -0x8000))
                                clipped = -0x8000;
                        else if (unlikely(total > 0x7fff))
                                clipped = 0x7fff;
                        *dst = clipped;
                } while (unlikely(total != *sum));
        }
}

/*
 * compare - count the positions at which two sample buffers differ.
 * @b1:   first buffer
 * @b2:   second buffer
 * @size: number of samples to compare in each buffer
 *
 * Returns the number of mismatching samples (0 when the buffers are equal).
 */
int compare(const s16* b1, const s16 *b2, unsigned int size)
{
        unsigned int mismatches = 0;
        unsigned int i;

        for (i = 0; i < size; i++) {
                if (b1[i] != b2[i])
                        mismatches++;
        }
        return mismatches;
}

/*
 * parse_positive_int - parse a decimal command-line argument in [1, limit].
 *
 * Returns 0 and stores the value in *out on success, -1 when the text is not
 * a complete decimal number or the value is out of range.
 */
static int parse_positive_int(const char *text, long limit, int *out)
{
        char *end;
        long value = strtol(text, &end, 10);

        if (end == text || *end != '\0' || value < 1 || value > limit)
                return -1;
        *out = (int)value;
        return 0;
}

/*
 * cpu_percent - share of one CPU needed to mix one stereo 44100 Hz stream,
 * given the cycles spent mixing n streams of size samples each.
 * The divisor is computed in double so that size * n cannot overflow int.
 */
static double cpu_percent(unsigned long long cycles, int size, int n,
                          double cpu_clock)
{
        return 100 * 2 * 44100.0 * cycles / ((double)size * n * cpu_clock);
}

/*
 * Benchmark driver.
 *
 * Usage: sum <samples> <streams> <max amplitude>
 *
 * Fills <streams> buffers of <samples> random s16 values in
 * [-max, max - 1], mixes them with each mix_areas variant, and prints the
 * cycle count, the estimated CPU share and, for variants 1-3, the number of
 * samples that differ from the reference (mix_areas0 + saturate).
 *
 * Returns 0 on success, 1 on bad arguments or allocation failure.
 */
int main(int argc, char **argv)
{
        int size, n, max;
        int i;
        int status = 1;
        unsigned long long begin, end;
        s16 *dst = NULL;
        s16 *check = NULL;
        s32 *sum = NULL;
        s16 **srcs = NULL;
        double cpu_clock;

        /*
         * max is capped at 0x8000 so that every generated sample fits in an
         * s16 and max * 2 cannot overflow; max >= 1 avoids rand() % 0.
         */
        if (argc < 4 ||
            parse_positive_int(argv[1], 0x7fffffffL, &size) ||
            parse_positive_int(argv[2], 0x7fffffffL, &n) ||
            parse_positive_int(argv[3], 0x8000L, &max)) {
                fprintf(stderr,
                        "usage: %s <samples> <streams> <max amplitude 1..32768>\n",
                        argc > 0 ? argv[0] : "sum");
                return 1;
        }

        /* calloc() checks the count * size multiplication for overflow. */
        dst = calloc(size, sizeof(*dst));
        check = calloc(size, sizeof(*check));
        sum = calloc(size, sizeof(*sum));
        srcs = calloc(n, sizeof(*srcs));
        if (!dst || !check || !sum || !srcs) {
                fprintf(stderr, "out of memory\n");
                goto out;
        }

        cpu_clock = detect_cpu_clock();
        printf("CPU clock: %f\n", cpu_clock);
        for (i = 0; i < n; i++) {
                int k;
                s16 *s = calloc(size, sizeof(*s));

                if (!s) {
                        fprintf(stderr, "out of memory\n");
                        goto out;
                }
                srcs[i] = s;
                for (k = 0; k < size; ++k, ++s) {
                        *s = (rand() % (max * 2)) - max;
                }
        }

        /* Reference: exact 32-bit sum, saturated once at the end. */
        memset(sum, 0, sizeof(*sum) * (size_t)size);
        rdtscll(begin);
        for (i = 0; i < n; i++) {
                mix_areas0(size, srcs[i], sum, 1);
        }
        saturate(size, check, sum, 1);
        rdtscll(end);
        printf("mix_areas0: %llu %f%%\n", end - begin,
               cpu_percent(end - begin, size, n, cpu_clock));

        memset(dst, 0, sizeof(*dst) * (size_t)size);
        rdtscll(begin);
        for (i = 0; i < n; i++) {
                mix_areas1(size, dst, srcs[i], 1, 1);
        }
        rdtscll(end);
        printf("mix_areas1: %llu %f%% (%d)\n", end - begin,
               cpu_percent(end - begin, size, n, cpu_clock),
               compare(dst, check, size));

        /*
         * Every variant starts from cleared dst and sum buffers so that its
         * result does not depend on what the previous variant left behind.
         *
         * mix_areas2 adds its step arguments to the raw addresses, so they
         * are byte strides: passing 1 would make it walk the buffers one
         * byte at a time with misaligned, overlapping accesses.
         */
        memset(dst, 0, sizeof(*dst) * (size_t)size);
        memset(sum, 0, sizeof(*sum) * (size_t)size);
        rdtscll(begin);
        for (i = 0; i < n; i++) {
                mix_areas2(size, dst, srcs[i], sum,
                           sizeof(*dst), sizeof(**srcs), sizeof(*sum));
        }
        rdtscll(end);
        printf("mix_areas2: %llu %f%% (%d)\n", end - begin,
               cpu_percent(end - begin, size, n, cpu_clock),
               compare(dst, check, size));

        memset(dst, 0, sizeof(*dst) * (size_t)size);
        memset(sum, 0, sizeof(*sum) * (size_t)size);
        rdtscll(begin);
        for (i = 0; i < n; i++) {
                mix_areas3(size, dst, srcs[i], sum, 1, 1);
        }
        rdtscll(end);
        printf("mix_areas3: %llu %f%% (%d)\n", end - begin,
               cpu_percent(end - begin, size, n, cpu_clock),
               compare(dst, check, size));

        status = 0;
out:
        if (srcs) {
                for (i = 0; i < n; i++)
                        free(srcs[i]);
        }
        free(srcs);
        free(sum);
        free(check);
        free(dst);
        return status;
}

Reply via email to