This is a kind of FYI-posting, summing up findings and code that I produced.

I've been investigating the enginefilterbutterworth8 code because the EQ doesn't sound as expected and it consumes a lot of CPU (a known issue).

The reason for the sound is that engFilButterworth works (in terms of sound equalizing) radically differently than expected: it's basically a frequency-dividing network with 8th-order Butterworth filters (known for phase and group-delay problems) producing three channels that are summed/mixed. It's no surprise that 4 filters of 8th order consume a lot of cycles...

A look at the actual code made me wonder why the calculation is implemented in double precision while CSAMPLE is float, and why there's an ugly memmove per step.

Attached you can find my drop-in replacement code. There are two versions in the file: the first, a C implementation, is basically the current code rewritten, omitting the memmove per step and using float; it is probably runnable on any platform.

The second version is an SSE implementation that works with gcc 4.3.3 (64-bit, probably identical on 32-bit). It assumes parameters in xmm0, rdi and rsi.

There's still quite some optimization potential, because the xxxPS ops could calculate 4 values in parallel, while currently the xxxSS ops calculate only a single value.

Regards,
Andreas
#include "filterstuff.h"


/* 

float assumptions(const float *coef, float *buf, register float val)
{
   val is in xmm0
   buf is in rsi
   coef is in rdi
   
   return value in xmm0
}
*/
 
// xmm0 - var
// xmm1 - iir and tmp
// xmm2 - b0
// xmm3 - b1


// BLOCKADD(i): one filter stage in scalar SSE, "add" variant.
//
// `i` is a string literal holding a decimal byte offset; it is pasted in
// front of the displacements by string-literal concatenation, e.g.
// BLOCKADD("8") addresses 8+0(%rsi), 8+4(%rsi), 8+4(%rdi) and 8+8(%rdi).
//
// Register use inside the stage:
//   xmm0  running sample value (input and output of the stage)
//   xmm1  scratch for coefficient products, then holds the saved iir value
//   xmm2  b0, loaded from i+0(%rsi)
//   xmm3  b1, loaded from i+4(%rsi)
//
// Computation performed by the instruction sequence:
//   iir  = xmm0 - c1*b0 - c2*b1      (c1 at i+4(%rdi), c2 at i+8(%rdi))
//   xmm0 = iir + b0 + b1 + b1
//   i+0(%rsi) = b1
//   i+4(%rsi) = iir
//
// NOTE(review): this is a basic asm statement with no operand or clobber
// list, so the compiler is not told that xmm0-xmm3 and the memory behind
// %rsi are modified, nor that %rdi/%rsi/%xmm0 must still hold the function
// arguments. Confirm the generated code for every compiler/flag combination.
#define BLOCKADD(i)  asm(				\
"movss	" i "+0(%rsi), %xmm2;	#b0	\n\t"	\
"movss	" i "+4(%rsi), %xmm3;	#b1	\n\t"	\
"movss	" i "+4(%rdi), %xmm1;	#c1	\n\t"	\
"mulss	%xmm2, %xmm1;			\n\t"	\
"subss	%xmm1, %xmm0; 			\n\t"	\
"movss	" i "+8(%rdi), %xmm1;	#c2	\n\t"	\
"mulss	%xmm3, %xmm1; 			\n\t"	\
"subss	%xmm1, %xmm0; 			\n\t"	\
"movaps	%xmm0, %xmm1;		#iir	\n\t"	\
"addss	%xmm2, %xmm0; 			\n\t"	\
"addss	%xmm3, %xmm0; 			\n\t"	\
"addss	%xmm3, %xmm0;		#val	\n\t"	\
"movss	%xmm3, " i "+0(%rsi); 		\n\t"	\
"movss	%xmm1, " i "+4(%rsi);	");

// BLOCKSUB(i): one filter stage in scalar SSE, "subtract" variant.
//
// `i` is a string literal holding a decimal byte offset; it is pasted in
// front of the displacements by string-literal concatenation, e.g.
// BLOCKSUB("8") addresses 8+0(%rsi), 8+4(%rsi), 8+4(%rdi) and 8+8(%rdi).
//
// Register use inside the stage:
//   xmm0  running sample value (input and output of the stage)
//   xmm1  scratch for coefficient products, then holds the saved iir value
//   xmm2  b0, loaded from i+0(%rsi)
//   xmm3  b1, loaded from i+4(%rsi)
//
// Computation performed by the instruction sequence:
//   iir  = xmm0 - c1*b0 - c2*b1      (c1 at i+4(%rdi), c2 at i+8(%rdi))
//   xmm0 = iir + b0 - b1 - b1        (b1 is subtracted twice, not added)
//   i+0(%rsi) = b1
//   i+4(%rsi) = iir
//
// NOTE(review): this is a basic asm statement with no operand or clobber
// list, so the compiler is not told that xmm0-xmm3 and the memory behind
// %rsi are modified, nor that %rdi/%rsi/%xmm0 must still hold the function
// arguments. Confirm the generated code for every compiler/flag combination.
#define BLOCKSUB(i)  asm(				\
"movss	" i "+0(%rsi), %xmm2;	#b0	\n\t"	\
"movss	" i "+4(%rsi), %xmm3;	#b1	\n\t"	\
"movss	" i "+4(%rdi), %xmm1;	#c1	\n\t"	\
"mulss	%xmm2, %xmm1;			\n\t"	\
"subss	%xmm1, %xmm0; 			\n\t"	\
"movss	" i "+8(%rdi), %xmm1;	#c2	\n\t"	\
"mulss	%xmm3, %xmm1; 			\n\t"	\
"subss	%xmm1, %xmm0; 			\n\t"	\
"movaps	%xmm0, %xmm1;		#iir	\n\t"	\
"addss	%xmm2, %xmm0; 			\n\t"	\
"subss	%xmm3, %xmm0; 			\n\t"	\
"subss	%xmm3, %xmm0;		#val	\n\t"	\
"movss	%xmm3, " i "+0(%rsi); 		\n\t"	\
"movss	%xmm1, " i "+4(%rsi);	");


// BLOCKEND: final scaling and function exit.
//
// Loads the float at (%rdi) (labelled c0) into xmm1, multiplies xmm0 by it,
// then executes `leave; ret`, i.e. the asm itself returns from the enclosing
// function with the result left in xmm0.
//
// NOTE(review): `leave` presumes the enclosing function set up a frame
// pointer in its prologue; this looks unsafe if the compiler omits the frame
// pointer, inlines the function, or saves callee-saved registers. Confirm
// against the actual compiler output.
#define BLOCKEND	asm(					\
"movss	(%rdi), %xmm1;			#c0	\n\t"	\
"mulss	%xmm1, %xmm0;			\n\t"	\
"leave;							\n\t"	\
"ret;								");

				

#if 1

// Inline-assembly highpass: processes one sample through four BLOCKSUB
// stages at byte offsets 0, 8, 16 and 24, then BLOCKEND.
//
// The asm addresses coef through %rdi, buf through %rsi and takes the sample
// from %xmm0 without declaring any operands, so it relies on the arguments
// still being in those registers when the asm executes.
// Each stage reads and rewrites two consecutive floats of buf and reads two
// consecutive floats of coef (starting at coef[1]); BLOCKEND multiplies the
// result by coef[0].
//
// The macros already end in ';', which is why the invocations carry none.
float doHighpass(const float *coef, float *buf, register float val)
{
   BLOCKSUB("0")
   BLOCKSUB("8")
   BLOCKSUB("16")
   BLOCKSUB("24")
   BLOCKEND
   
   // BLOCKEND's asm already executes `ret`, so this statement is presumably
   // never reached; it gives the function a C-level return value.
   return val;
}

// Inline-assembly lowpass: processes one sample through four BLOCKADD
// stages at byte offsets 0, 8, 16 and 24, then BLOCKEND.
//
// The asm addresses coef through %rdi, buf through %rsi and takes the sample
// from %xmm0 without declaring any operands, so it relies on the arguments
// still being in those registers when the asm executes.
// Each stage reads and rewrites two consecutive floats of buf and reads two
// consecutive floats of coef (starting at coef[1]); BLOCKEND multiplies the
// result by coef[0].
//
// The macros already end in ';', which is why the invocations carry none.
float doLowpass(const float *coef, float *buf, register float val)
{
   BLOCKADD("0")
   BLOCKADD("8")
   BLOCKADD("16")
   BLOCKADD("24")
   BLOCKEND
   
   // BLOCKEND's asm already executes `ret`, so this statement is presumably
   // never reached; it gives the function a C-level return value.
   return val;
}

// Inline-assembly bandpass: processes one sample through four BLOCKSUB
// stages (byte offsets 0..24) followed by four BLOCKADD stages (byte offsets
// 32..56), then BLOCKEND.
//
// The asm addresses coef through %rdi, buf through %rsi and takes the sample
// from %xmm0 without declaring any operands, so it relies on the arguments
// still being in those registers when the asm executes.
// The eight stages together touch buf[0..15] and coef[1..16]; BLOCKEND
// multiplies the result by coef[0].
//
// The macros already end in ';', which is why the invocations carry none.
float doBandpass(const float *coef, float *buf, register float val)
{
   BLOCKSUB("0")
   BLOCKSUB("8")
   BLOCKSUB("16")
   BLOCKSUB("24")
   BLOCKADD("32")
   BLOCKADD("40")
   BLOCKADD("48")
   BLOCKADD("56")
   BLOCKEND

   // BLOCKEND's asm already executes `ret`, so this statement is presumably
   // never reached; it gives the function a C-level return value.
   return val;
}

#else

// FSUB(c1, c2, b0, b1): one recursive filter stage, "subtract" variant.
//
//   iir = val - c1*b0 - c2*b1
//   val = iir + b0 - b1 - b1
//   b0  = b1
//   b1  = iir
//
// c1, c2  stage coefficients (any float expressions)
// b0, b1  the stage's two state values; must be modifiable lvalues, they
//         are read several times and then overwritten
//
// The macro reads and writes the float variable `val` and writes the float
// variable `iir`; both must be declared in the scope where it is expanded.
//
// The body is wrapped in do { } while (0) so that `FSUB(...);` is a single
// statement (safe inside an unbraced if/else), and every argument is
// parenthesized so that expression arguments keep their meaning.
#define FSUB(c1, c2, b0, b1) 	\
   do {						\
      iir = val - (c1)*(b0) - (c2)*(b1);	\
      val = iir + (b0) - (b1) - (b1);		\
      (b0) = (b1);				\
      (b1) = iir;				\
   } while (0)

// FADD(c1, c2, b0, b1): one recursive filter stage, "add" variant.
//
//   iir = val - c1*b0 - c2*b1
//   val = iir + b0 + b1 + b1
//   b0  = b1
//   b1  = iir
//
// c1, c2  stage coefficients (any float expressions)
// b0, b1  the stage's two state values; must be modifiable lvalues, they
//         are read several times and then overwritten
//
// The macro reads and writes the float variable `val` and writes the float
// variable `iir`; both must be declared in the scope where it is expanded.
//
// The body is wrapped in do { } while (0) so that `FADD(...);` is a single
// statement (safe inside an unbraced if/else), and every argument is
// parenthesized so that expression arguments keep their meaning.
#define FADD(c1, c2, b0, b1) 	\
   do {						\
      iir = val - (c1)*(b0) - (c2)*(b1);	\
      val = iir + (b0) + (b1) + (b1);		\
      (b0) = (b1);				\
      (b1) = iir;				\
   } while (0)

// Portable bandpass: runs one sample through eight cascaded stages and
// returns the result scaled by coef[0].
//
// coef  coef[0] is the output gain; stage s uses coef[1 + 2*s] and
//       coef[2 + 2*s] (coef[1..16] in total)
// buf   filter state, two floats per stage (buf[0..15]); updated in place
// val   input sample
//
// Stages 0-3 combine the state with subtraction, stages 4-7 with addition.
float doBandpass(const float *coef, float *buf, register float val)
{
   int stage;

   for (stage = 0; stage < 8; stage++) {
      const float *k = coef + 1 + 2 * stage;
      float *state = buf + 2 * stage;
      const float older = state[0];
      const float newer = state[1];
      const float w = val - k[0] * older - k[1] * newer;

      if (stage < 4) {
         val = w + older - newer - newer;
      } else {
         val = w + older + newer + newer;
      }

      // Shift the stage's delay line: drop the oldest value, keep the new one.
      state[0] = newer;
      state[1] = w;
   }

   return val * coef[0];
}

// Portable lowpass: runs one sample through four cascaded stages and
// returns the result scaled by coef[0].
//
// coef  coef[0] is the output gain; stage s uses coef[1 + 2*s] and
//       coef[2 + 2*s] (coef[1..8] in total)
// buf   filter state, two floats per stage (buf[0..7]); updated in place
// val   input sample
float doLowpass(const float *coef, float *buf, register float val)
{
   int stage;

   for (stage = 0; stage < 4; stage++) {
      const float *k = coef + 1 + 2 * stage;
      float *state = buf + 2 * stage;
      const float older = state[0];
      const float newer = state[1];
      const float w = val - k[0] * older - k[1] * newer;

      val = w + older + newer + newer;

      // Shift the stage's delay line: drop the oldest value, keep the new one.
      state[0] = newer;
      state[1] = w;
   }

   return val * coef[0];
}

// Portable highpass: runs one sample through four cascaded stages and
// returns the result scaled by coef[0].
//
// coef  coef[0] is the output gain; stage s uses coef[1 + 2*s] and
//       coef[2 + 2*s] (coef[1..8] in total)
// buf   filter state, two floats per stage (buf[0..7]); updated in place
// val   input sample
float doHighpass(const float *coef, float *buf, register float val)
{
   int stage;

   for (stage = 0; stage < 4; stage++) {
      const float *k = coef + 1 + 2 * stage;
      float *state = buf + 2 * stage;
      const float older = state[0];
      const float newer = state[1];
      const float w = val - k[0] * older - k[1] * newer;

      val = w + older - newer - newer;

      // Shift the stage's delay line: drop the oldest value, keep the new one.
      state[0] = newer;
      state[1] = w;
   }

   return val * coef[0];
}
#endif

------------------------------------------------------------------------------
Crystal Reports - New Free Runtime and 30 Day Trial
Check out the new simplified licensing option that enables 
unlimited royalty-free distribution of the report engine 
for externally facing server and web deployment. 
http://p.sf.net/sfu/businessobjects
_______________________________________________
Mixxx-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/mixxx-devel

Reply via email to