This is a kind of FYI posting, summing up my findings and the code I produced.
I've been investigating the enginefilterbutterworth8 code because the EQ
doesn't sound as expected, and it consumes a lot of CPU (a known issue).
The reason for the sound is that engFilButterworth works (in terms of
sound equalizing) radically differently than expected: it's basically a
frequency-dividing network with 8th-order Butterworth filters (known for
phase and group delay problems) giving three channels that are
summed/mixed. It's no surprise that 4 filters of 8th order consume a lot of cycles...
A look at the actual code made me wonder why the calculation is
implemented in double precision while CSAMPLE is float, and why there's an
ugly memmove per step.
Attached you can find my drop-in replacement code. There are two
versions in the file: the first, a plain C implementation, is basically the
current code rewritten to omit the per-step memmove and to use float;
it should be runnable on any platform.
The second version is an SSE implementation that works with gcc 4.3.3
(64-bit, probably identical on 32-bit). It assumes the parameters are in
xmm0, rdi and rsi.
There is still considerable optimization potential, because the packed xxxPS
ops could calculate 4 values in parallel, whereas the xxxSS ops currently used
calculate a single value.
Regards,
Andreas
#include "filterstuff.h"
/*
float assumptions(const float *coef, float *buf, register float val)
{
val is in xmm0
buf is in rsi
coef is in rdi
return value in xmm0
}
*/
// xmm0 - var
// xmm1 - iir and tmp
// xmm2 - b0
// xmm3 - b1
// BLOCKADD(i): one filter stage written in scalar SSE assembly.
// i is a string literal giving the byte offset of this stage; it is pasted
// in front of the displacement of every memory operand.
// With b0 = i+0(%rsi), b1 = i+4(%rsi), c1 = i+4(%rdi), c2 = i+8(%rdi) and the
// current sample in %xmm0, the instruction sequence computes:
//   iir       = xmm0 - c1*b0 - c2*b1
//   xmm0      = iir + b0 + b1 + b1
//   i+0(%rsi) = b1
//   i+4(%rsi) = iir
// %xmm1, %xmm2 and %xmm3 are overwritten as scratch registers.
// NOTE(review): this is basic asm with no operand or clobber lists, so the
// compiler is not told which registers and memory are read or written.
// It presumably only works while coef/buf/val really are in rdi/rsi/xmm0
// at this point -- confirm against the compiler and flags in use.
#define BLOCKADD(i) asm( \
"movss " i "+0(%rsi), %xmm2; #b0 \n\t" \
"movss " i "+4(%rsi), %xmm3; #b1 \n\t" \
"movss " i "+4(%rdi), %xmm1; #c1 \n\t" \
"mulss %xmm2, %xmm1; \n\t" \
"subss %xmm1, %xmm0; \n\t" \
"movss " i "+8(%rdi), %xmm1; #c2 \n\t" \
"mulss %xmm3, %xmm1; \n\t" \
"subss %xmm1, %xmm0; \n\t" \
"movaps %xmm0, %xmm1; #iir \n\t" \
"addss %xmm2, %xmm0; \n\t" \
"addss %xmm3, %xmm0; \n\t" \
"addss %xmm3, %xmm0; #val \n\t" \
"movss %xmm3, " i "+0(%rsi); \n\t" \
"movss %xmm1, " i "+4(%rsi); ");
// BLOCKSUB(i): one filter stage written in scalar SSE assembly.
// i is a string literal giving the byte offset of this stage; it is pasted
// in front of the displacement of every memory operand.
// With b0 = i+0(%rsi), b1 = i+4(%rsi), c1 = i+4(%rdi), c2 = i+8(%rdi) and the
// current sample in %xmm0, the instruction sequence computes:
//   iir       = xmm0 - c1*b0 - c2*b1
//   xmm0      = iir + b0 - b1 - b1
//   i+0(%rsi) = b1
//   i+4(%rsi) = iir
// The only difference from BLOCKADD is that b1 is subtracted twice instead
// of added twice. %xmm1, %xmm2 and %xmm3 are overwritten as scratch.
// NOTE(review): this is basic asm with no operand or clobber lists, so the
// compiler is not told which registers and memory are read or written.
// It presumably only works while coef/buf/val really are in rdi/rsi/xmm0
// at this point -- confirm against the compiler and flags in use.
#define BLOCKSUB(i) asm( \
"movss " i "+0(%rsi), %xmm2; #b0 \n\t" \
"movss " i "+4(%rsi), %xmm3; #b1 \n\t" \
"movss " i "+4(%rdi), %xmm1; #c1 \n\t" \
"mulss %xmm2, %xmm1; \n\t" \
"subss %xmm1, %xmm0; \n\t" \
"movss " i "+8(%rdi), %xmm1; #c2 \n\t" \
"mulss %xmm3, %xmm1; \n\t" \
"subss %xmm1, %xmm0; \n\t" \
"movaps %xmm0, %xmm1; #iir \n\t" \
"addss %xmm2, %xmm0; \n\t" \
"subss %xmm3, %xmm0; \n\t" \
"subss %xmm3, %xmm0; #val \n\t" \
"movss %xmm3, " i "+0(%rsi); \n\t" \
"movss %xmm1, " i "+4(%rsi); ");
// BLOCKEND: final step of a filter function.
// Loads the float at (%rdi) (labelled c0), multiplies %xmm0 by it, then
// executes "leave; ret", i.e. it returns from the enclosing function from
// inside the asm statement with the result left in %xmm0.
// NOTE(review): "leave" presumes the enclosing function set up a frame
// pointer in its prologue; a build that omits the frame pointer or inlines
// the function would presumably break -- confirm the build flags.
#define BLOCKEND asm( \
"movss (%rdi), %xmm1; #c0 \n\t" \
"mulss %xmm1, %xmm0; \n\t" \
"leave; \n\t" \
"ret; ");
#if 1
// doHighpass, inline-assembly variant (the branch enabled by "#if 1").
// Expands four BLOCKSUB stages at byte offsets 0, 8, 16 and 24, followed by
// BLOCKEND. The macros address %rdi, %rsi and %xmm0 directly instead of the
// C parameters; the notes near the top of the file state the assumption that
// these hold coef, buf and val respectively.
// With 4-byte floats the offsets touch coef[0..8] and buf[0..7].
// NOTE(review): the register mapping is not enforced by the code -- confirm
// it holds for the target compiler and optimization level.
float doHighpass(const float *coef, float *buf, register float val)
{
BLOCKSUB("0")
BLOCKSUB("8")
BLOCKSUB("16")
BLOCKSUB("24")
BLOCKEND
// BLOCKEND already executes "ret", so the asm path never reaches this
// statement; presumably it is kept so the function has a C-level return.
return val;
}
// doLowpass, inline-assembly variant (the branch enabled by "#if 1").
// Expands four BLOCKADD stages at byte offsets 0, 8, 16 and 24, followed by
// BLOCKEND. The macros address %rdi, %rsi and %xmm0 directly instead of the
// C parameters; the notes near the top of the file state the assumption that
// these hold coef, buf and val respectively.
// With 4-byte floats the offsets touch coef[0..8] and buf[0..7].
// NOTE(review): the register mapping is not enforced by the code -- confirm
// it holds for the target compiler and optimization level.
float doLowpass(const float *coef, float *buf, register float val)
{
BLOCKADD("0")
BLOCKADD("8")
BLOCKADD("16")
BLOCKADD("24")
BLOCKEND
// BLOCKEND already executes "ret", so the asm path never reaches this
// statement; presumably it is kept so the function has a C-level return.
return val;
}
// doBandpass, inline-assembly variant (the branch enabled by "#if 1").
// Expands four BLOCKSUB stages at byte offsets 0..24 and then four BLOCKADD
// stages at byte offsets 32..56, followed by BLOCKEND. The macros address
// %rdi, %rsi and %xmm0 directly instead of the C parameters; the notes near
// the top of the file state the assumption that these hold coef, buf and val.
// With 4-byte floats the offsets touch coef[0..16] and buf[0..15].
// NOTE(review): the register mapping is not enforced by the code -- confirm
// it holds for the target compiler and optimization level.
float doBandpass(const float *coef, float *buf, register float val)
{
BLOCKSUB("0")
BLOCKSUB("8")
BLOCKSUB("16")
BLOCKSUB("24")
BLOCKADD("32")
BLOCKADD("40")
BLOCKADD("48")
BLOCKADD("56")
BLOCKEND
// BLOCKEND already executes "ret", so the asm path never reaches this
// statement; presumably it is kept so the function has a C-level return.
return val;
}
#else
// FSUB(c1, c2, b0, b1): one filter stage in plain C.
// Requires float lvalues named `iir` and `val` in the calling scope;
// b0 and b1 must be lvalues because the stage state is written back.
//   iir = val - c1*b0 - c2*b1
//   val = iir + b0 - b1 - b1
//   b0  = b1
//   b1  = iir
// Arguments are parenthesized so that expression arguments keep their
// meaning, and the body is wrapped in do { } while (0) so the macro behaves
// as a single statement (safe under an unbraced if/else).
// b0 and b1 are evaluated more than once; do not pass expressions with
// side effects.
#define FSUB(c1, c2, b0, b1) \
    do { \
        iir = val - (c1) * (b0) - (c2) * (b1); \
        val = iir + (b0) - (b1) - (b1); \
        (b0) = (b1); \
        (b1) = iir; \
    } while (0)
// FADD(c1, c2, b0, b1): one filter stage in plain C.
// Requires float lvalues named `iir` and `val` in the calling scope;
// b0 and b1 must be lvalues because the stage state is written back.
//   iir = val - c1*b0 - c2*b1
//   val = iir + b0 + b1 + b1
//   b0  = b1
//   b1  = iir
// Arguments are parenthesized so that expression arguments keep their
// meaning, and the body is wrapped in do { } while (0) so the macro behaves
// as a single statement (safe under an unbraced if/else).
// b0 and b1 are evaluated more than once; do not pass expressions with
// side effects.
#define FADD(c1, c2, b0, b1) \
    do { \
        iir = val - (c1) * (b0) - (c2) * (b1); \
        val = iir + (b0) + (b1) + (b1); \
        (b0) = (b1); \
        (b1) = iir; \
    } while (0)
// doBandpass, portable C variant.
// Pushes one sample through eight cascaded stages: stages 0-3 use FSUB and
// stages 4-7 use FADD. Stage s takes its two coefficients from
// coef[2*s+1] and coef[2*s+2] and keeps its state in buf[2*s] and
// buf[2*s+1], so coef[0..16] and buf[0..15] are accessed.
// The result of the last stage is scaled by coef[0] and returned.
// FSUB/FADD read and update the locals `iir` and `val`.
float doBandpass(const float *coef, float *buf, register float val)
{
    float iir;
    int stage;

    for (stage = 0; stage < 4; ++stage) {
        FSUB(coef[2 * stage + 1], coef[2 * stage + 2],
             buf[2 * stage], buf[2 * stage + 1]);
    }

    for (stage = 4; stage < 8; ++stage) {
        FADD(coef[2 * stage + 1], coef[2 * stage + 2],
             buf[2 * stage], buf[2 * stage + 1]);
    }

    return val * coef[0];
}
// doLowpass, portable C variant.
// Pushes one sample through four cascaded FADD stages. Stage s takes its
// two coefficients from coef[2*s+1] and coef[2*s+2] and keeps its state in
// buf[2*s] and buf[2*s+1], so coef[0..8] and buf[0..7] are accessed.
// The result of the last stage is scaled by coef[0] and returned.
// FADD reads and updates the locals `iir` and `val`.
float doLowpass(const float *coef, float *buf, register float val)
{
    float iir;
    int stage;

    for (stage = 0; stage < 4; ++stage) {
        FADD(coef[2 * stage + 1], coef[2 * stage + 2],
             buf[2 * stage], buf[2 * stage + 1]);
    }

    return val * coef[0];
}
// doHighpass, portable C variant.
// Pushes one sample through four cascaded FSUB stages. Stage s takes its
// two coefficients from coef[2*s+1] and coef[2*s+2] and keeps its state in
// buf[2*s] and buf[2*s+1], so coef[0..8] and buf[0..7] are accessed.
// The result of the last stage is scaled by coef[0] and returned.
// FSUB reads and updates the locals `iir` and `val`.
float doHighpass(const float *coef, float *buf, register float val)
{
    float iir;
    int stage;

    for (stage = 0; stage < 4; ++stage) {
        FSUB(coef[2 * stage + 1], coef[2 * stage + 2],
             buf[2 * stage], buf[2 * stage + 1]);
    }

    return val * coef[0];
}
#endif
------------------------------------------------------------------------------
Crystal Reports - New Free Runtime and 30 Day Trial
Check out the new simplified licensing option that enables
unlimited royalty-free distribution of the report engine
for externally facing server and web deployment.
http://p.sf.net/sfu/businessobjects
_______________________________________________
Mixxx-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/mixxx-devel