Gentlemen:
Attached is a patch against Mixxx 1.8.1 that adds an SSE3-optimized IIR
filter. I'll post a patch against trunk later.
In a standalone test environment, this gives about a 70% speed boost on
a Prescott processor (T2400 @ 1.83 GHz). Running Mixxx 1.8.1 itself
under valgrind/callgrind, I measure a 36% speed boost.
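(For the record, I collected these numbers with something along the
lines of "valgrind --tool=callgrind ./mixxx", then compared the
per-function Ir -- instruction fetch -- counts from the resulting
callgrind.out.<pid> files using callgrind_annotate/kcachegrind.)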
IMPORTANT: You must compile with -DNDEBUG to realize the performance
boost. (The new code asserts liberally -- including in the per-sample
loop -- and those checks are expensive unless they're compiled out.)
I used this for CXXFLAGS:
-msse -msse2 -msse3 -O3 -ffast-math -funroll-loops -march=prescott -mtune=native -DNDEBUG
You can download my test environment here:
http://gabe.is-a-geek.org/tmp/opt-filter-mixxx-7a7b8167.tar.bz2
Results from valgrind
=====================
EngineFilterIIR::process():
1.8.1: 453536195 Ir / 3632 Calls = 124872 Ir/call
patch: 216047832 Ir / 3864 Calls = 55942 Ir/call
CHANGE: -55.2 %
paV19Callback():
1.8.1: 696842371 Ir / 606 Calls = 1149905 Ir/call
patch: 475035892 Ir / 644 Calls = 737633 Ir/call
CHANGE: -35.9 %
Why does the 55% shrink to an overall 36%? Mostly just arithmetic: the
filter accounts for about 65% of the callback's instructions (453M of
697M Ir), and 0.65 * 0.552 = 0.359 -- almost exactly the 36% measured.
So the abstraction in my implementation (d-pointer/cheshire cat) seems
to cost little or nothing.
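By the way, if you want to poke at the filter outside of Mixxx, the
heart of my test harness boils down to something like this (a
simplified sketch -- the coefficient values below are placeholders, and
it assumes CSAMPLE's usual float typedef comes in through the header;
the real tables and harness are in the tarball above):

    #include "engine/enginefilteriir.h"
    #include <vector>

    int main()
    {
        // Placeholder 8th-order table: coefs[0] is the gain,
        // coefs[1..12] would be the real filter coefficients.
        static const double coefs[13] = { 1.0 }; // rest zero-initialized

        // Interleaved L/R samples: iBufferSize counts samples, not frames.
        std::vector<CSAMPLE> in(4096, 0.25f), out(4096, 0.0f);

        EngineFilterIIR filter(coefs, 8);
        for (int pass = 0; pass < 1000; ++pass)
            filter.process(&in[0], &out[0], (int)in.size());
        return 0;
    }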
Thanks,
Gabriel
src/engine/enginefilteriir.cpp | 1128 +++++++++++++++++++++++++++++++++++++---
src/engine/enginefilteriir.h | 9 +-
2 files changed, 1047 insertions(+), 90 deletions(-)
diff --git a/src/engine/enginefilteriir.cpp b/src/engine/enginefilteriir.cpp
index 702d77e..c99b722 100644
--- a/src/engine/enginefilteriir.cpp
+++ b/src/engine/enginefilteriir.cpp
@@ -2,6 +2,7 @@
enginefilteriir.cpp - description
-------------------
copyright : (C) 2002 by Tue and Ken Haste Andersen
+ (C) 2010 Gabriel M. Beddingfield <[email protected]>
email :
***************************************************************************/
@@ -15,109 +16,1070 @@
***************************************************************************/
#include "enginefilteriir.h"
+#include <cstring>
+#include <cassert>
+#include <memory>
-EngineFilterIIR::EngineFilterIIR(const double * pCoefs, int iOrder)
+#ifdef __SSE3__
+#define IIR_ENABLE_SSE3
+#else
+#warning "Using NON-SSE3 optimized version of filter"
+#endif
+
+#ifdef IIR_ENABLE_SSE3
+#include <xmmintrin.h> // SSE
+#include <emmintrin.h> // SSE2
+#include <pmmintrin.h> // SSE3
+#endif
+
+namespace DetailsEngineFilterIIR
{
- order = iOrder;
- coefs = pCoefs;
+
+#ifdef IIR_ENABLE_SSE3
+
+ static const unsigned short UNALIGNED = 4;
+
+ // Vector of 2 Double-precision Floats
+ typedef __m128d __v2df;
+ typedef union {
+ __v2df v;
+ double d[2];
+ } v2df;
+
+ typedef __m128 __v4sf;
+ typedef union {
+ __v4sf v;
+ float f[4];
+ } v4sf;
+
+ inline bool not_aligned_16(const void* ptr)
+ {
+ return (((intptr_t)ptr) % 16) != 0;
+ }
+
+ inline bool aligned_16(const void* ptr)
+ {
+ return (((intptr_t)ptr) % 16) == 0;
+ }
+
+ /* class DetailsEngineFilterIIR::FilterSSE<Order>
+ *
+ * For documentation, see the process() method.
+ */
+ template< short Order >
+ class FilterSSE : public EngineObject
+ {
+ public:
+ FilterSSE(const double *coefs);
+ ~FilterSSE();
+ void process(const CSAMPLE *pIn, const CSAMPLE *pOut, const int iBufferSize);
+ protected:
+ enum {
+ ORDER = Order
+ };
+ unsigned short _k;
+ const unsigned short _k_mask;
+ v2df * _gain;
+ v2df * _CXY[Order];
+ v2df *_xv; // Circular buffer
+ v2df *_yv; // Circular buffer, adjacent to _xv
+ v2df *_memory; // owns the allocation; std::auto_ptr would pair a scalar delete with new[]
+ };
+
+#endif // IIR_ENABLE_SSE3
+
+ class FilterReference : public EngineObject
+ {
+ public:
+ FilterReference(int order, const double* coefs);
+ ~FilterReference();
+ void process(const CSAMPLE *pIn, const CSAMPLE *pOut, const int iBufferSize);
+ protected:
+ int order;
+ const double *coefs;
+ enum {
+ MAXNZEROS=8,
+ MAXNPOLES=8
+ };
+ double xv1[MAXNZEROS+1], yv1[MAXNPOLES+1];
+ double xv2[MAXNZEROS+1], yv2[MAXNPOLES+1];
+ };
+
+} // namespace DetailsEngineFilterIIR
- // Reset the yv's:
- memset(yv1, 0, sizeof(yv1));
- memset(yv2, 0, sizeof(yv2));
- memset(xv1, 0, sizeof(xv1));
- memset(xv2, 0, sizeof(xv2));
+EngineFilterIIR::EngineFilterIIR(const double *pCoefs, int iOrder)
+{
+
+#ifdef IIR_ENABLE_SSE3
+ // FilterSSE<> derives from EngineObject, so a plain assignment
+ // (no cast) is all that's needed here.
+ switch(iOrder) {
+ case 2:
+ _d = new DetailsEngineFilterIIR::FilterSSE<2>(pCoefs);
+ break;
+ case 4:
+ _d = new DetailsEngineFilterIIR::FilterSSE<4>(pCoefs);
+ break;
+ case 8:
+ _d = new DetailsEngineFilterIIR::FilterSSE<8>(pCoefs);
+ break;
+ default:
+ _d = 0;
+ assert(false);
+ }
+#else // IIR_ENABLE_SSE3
+ _d = new DetailsEngineFilterIIR::FilterReference(iOrder, pCoefs);
+#endif
+
+ assert(_d);
}
EngineFilterIIR::~EngineFilterIIR()
{
+ delete _d;
+ _d = 0;
}
void EngineFilterIIR::process(const CSAMPLE * pIn, const CSAMPLE * pOut, const int iBufferSize)
{
- CSAMPLE * pOutput = (CSAMPLE *)pOut;
- double GAIN = coefs[0];
- int i;
- for (i=0; i<iBufferSize; i+=2)
+ _d->process(pIn, pOut, iBufferSize);
+}
+
+namespace DetailsEngineFilterIIR
+{
+#ifdef IIR_ENABLE_SSE3
+ template< short Order >
+ FilterSSE<Order>::FilterSSE(const double *coefs) :
+ _k(0),
+ _k_mask(Order-1)
+ {
+ // ORDER must be a power of 2:
+ assert( (ORDER & (ORDER-1)) == 0 );
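+ // (e.g. 8 & 7 == 0, but 6 & 5 == 4 -- only orders 2, 4, 8, ... pass)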
+
+ /* ALLOCATE ALIGNED MEMORY FOR INTERNAL VARIABLES
+ *
+ * SSE ops work best with 16-byte aligned memory. This
+ * allocates a large block of memory for _xv, _yv, and _CXY.
+ * These variables are assigned to adjacent memory.
+ *
+ * Since the address returned might not be properly aligned,
+ * we add extra padding so that we can ignore up to 15 bytes
+ * at the beginning. This ensures that our arrays are 16-byte
+ * aligned.
+ */
+ int mem_reqd = 2 * (ORDER*ORDER) + 2 * ORDER + 4; // _gain, _CXY, _xv, _yv, and alignment padding
+ _memory = new v2df[mem_reqd];
+ memset(_memory, 0, mem_reqd * sizeof(v2df));
+
+ char* tmp = reinterpret_cast<char*>(_memory);
+ v2df* end = (v2df*)tmp + mem_reqd;
+ while( not_aligned_16(tmp) ) ++tmp;
+ v2df* beg = (v2df*)tmp;
+ assert( beg < end );
+
+ int k;
+ _gain = beg;
+ _CXY[0] = _gain + 1;
+ for( k=1 ; k<ORDER ; ++k ) _CXY[k] = _CXY[k-1] + (2*ORDER);
+ _xv = _CXY[ORDER-1] + (2*ORDER);
+ _yv = _xv + ORDER;
+ assert( _yv < end );
+
+ _gain->d[0] = coefs[0];
+ _gain->d[1] = coefs[0];
+
+ /* Initialize the _CXY coefficient matrix.
+ * See FilterSSE::process() for documentation on what this is doing.
+ */
+ const short turnaround = ORDER/2;
+ short c;
+ _CXY[0][0].d[0] = 1.0;
+ for( c=k=1 ; k <= turnaround ; ++k, ++c ) {
+ _CXY[0][k].d[0] = coefs[c];
+ }
+ for( c-=2 ; k<ORDER ; ++k, --c ) {
+ _CXY[0][k].d[0] = coefs[c];
+ }
+ assert( k == ORDER );
+ assert( c == 0 );
+
+ for( c=turnaround+1 ; k < (2*ORDER) ; ++k, ++c ) { // k < 2*ORDER: one slot per y value, and don't read past coefs[]
+ _CXY[0][k].d[0] = coefs[c];
+ }
+
+ // Initialize the 2nd half of the vectors.
+ for( k=0 ; k<(2*ORDER) ; ++k ) {
+ _CXY[0][k].d[1] = _CXY[0][k].d[0];
+ }
+
+ // Pre-shuffle the coefficients
+ for( k=1 ; k < ORDER ; ++k ) { // Rows
+ for( c=0 ; c < ORDER ; ++c ) { // Cols
+ _CXY[k][c].v = _CXY[k-1][ _k_mask & (c-1) ].v;
+ _CXY[k][ORDER+c].v = _CXY[k-1][ ORDER + (_k_mask & (c-1)) ].v;
+ }
+ }
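+
+ /* The result, for ORDER==4 (x-part | y-part), is each half of the
+ * previous row rotated right by one:
+ *
+ * row0: 1 c1 c2 c1 | c3 c4 c5 c6
+ * row1: c1 1 c1 c2 | c6 c3 c4 c5
+ * row2: c2 c1 1 c1 | c5 c6 c3 c4
+ * row3: c1 c2 c1 1 | c4 c5 c6 c3
+ */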
+
+ }
+
+ template< short Order >
+ FilterSSE<Order>::~FilterSSE()
+ {
+ delete [] _memory;
+ }
+
+ /* Load the 4 packed-single floats at memory location `src` and
+ * store them in dest0 and dest1 as packed-doubles. `src` is
+ * assumed to be 16-byte aligned.
+ */
+ static inline void read_4_samples_aligned( v2df& dest0,
+ v2df& dest1,
+ const CSAMPLE* __restrict const src )
+ {
+ v4sf tmp;
+
+ assert( aligned_16(src) );
+ tmp.v = _mm_load_ps( src );
+ dest0.v = _mm_cvtps_pd( tmp.v );
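+ // movehl copies floats 2,3 into the low half so they can be converted next.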
+ tmp.v = _mm_movehl_ps( tmp.v, tmp.v );
+ dest1.v = _mm_cvtps_pd( tmp.v );
+ }
+
+ /* Load previous_leftover and src[0] into dest0 as a packed
+ * double. Load src[1] and src[2] into dest1 as a packed double.
+ * Load src[3] into previous_leftover. `src` is assumed to be
+ * 16-byte aligned.
+ */
+ static inline void read_4_samples_par_aligned( v2df& dest0,
+ v2df& dest1,
+ const CSAMPLE* __restrict const src,
+ double& previous_leftover )
+ {
+ v4sf tmp;
+
+ assert( aligned_16(src) );
+ tmp.v = _mm_load_ps( src );
+ dest0.d[0] = previous_leftover;
+ dest0.d[1] = (double)tmp.f[0];
+ dest1.d[0] = (double)tmp.f[1];
+ dest1.d[1] = (double)tmp.f[2];
+ previous_leftover = (double)tmp.f[3];
+ }
+
+ /* Load the 4 packed-single floats at memory location `src` and
+ * store them in dest0 and dest1 as packed-doubles. `src` is
+ * assumed to be unaligned.
+ */
+ static inline void read_4_samples_lame( v2df& dest0,
+ v2df& dest1,
+ const CSAMPLE* __restrict const src )
+ {
+ dest0.d[0] = (double)src[0];
+ dest0.d[1] = (double)src[1];
+ dest1.d[0] = (double)src[2];
+ dest1.d[1] = (double)src[3];
+ }
+
+ /* Write the 4 packed-double floats in src0 and src1 to `dest` as
+ * packed floats. `dest` is assumed to be 16-byte aligned.
+ */
+ static inline void write_4_samples_aligned( CSAMPLE * dest,
+ const v2df& src0,
+ const v2df& src1 )
+ {
+ v4sf tmp0, tmp1;
+ tmp0.v = _mm_cvtpd_ps( src0.v );
+ tmp1.v = _mm_cvtpd_ps( src1.v );
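+ // movelh merges the two low pairs into [src0.lo src0.hi src1.lo src1.hi].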
+ tmp0.v = _mm_movelh_ps( tmp0.v, tmp1.v );
+ _mm_store_ps( dest, tmp0.v );
+ }
+
+
+ /* For write_4_samples_par_aligned() to work, `pending` must be
+ * primed with the leading unaligned output samples, while the rest
+ * of the data is written directly to `dest`. This function does
+ * that priming, according to the variable `alignment`.
+ * `dest` is assumed to have an alignment matching `alignment`,
+ * i.e. NOT 16-byte aligned.
+ *
+ * This function should not be used if alignment == 0. Otherwise,
+ * it should be called once, at the beginning of the inner loop.
+ *
+ * If alignment != UNALIGNED, then (4 - alignment) samples are
+ * written to `dest`, and the next `dest` address
+ * (dest + 4 - alignment) will be 16-byte aligned.
+ *
+ * If alignment == UNALIGNED, then 4 samples are written.
+ */
+ static inline void write_4_samples_par_aligned_init( CSAMPLE * dest,
+ const v2df& src0,
+ const v2df& src1,
+ const unsigned short alignment,
+ v4sf& pending )
+ {
+ switch(alignment) {
+ case 1:
+ *dest++ = (float)src0.d[0];
+ *dest++ = (float)src0.d[1];
+ *dest++ = (float)src1.d[0];
+ pending.f[0] = (float)src1.d[1];
+ break;
+ case 2:
+ *dest++ = (float)src0.d[0];
+ *dest++ = (float)src0.d[1];
+ pending.f[0] = (float)src1.d[0];
+ pending.f[1] = (float)src1.d[1];
+ break;
+ case 3:
+ *dest++ = (float)src0.d[0];
+ pending.f[0] = (float)src0.d[1];
+ pending.f[1] = (float)src1.d[0];
+ pending.f[2] = (float)src1.d[1];
+ break;
+ default:
+ (*dest++) = (float)src0.d[0];
+ (*dest++) = (float)src0.d[1];
+ (*dest++) = (float)src1.d[0];
+ (*dest++) = (float)src1.d[1];
+ }
+ }
+
+ /* Writes the next 4 samples -- drawn from pending[], src0, and src1
+ * -- to `dest` as packed-single floats. `dest` is assumed to be
+ * 16-byte aligned if alignment != UNALIGNED. The first float
+ * consumed is pending.f[alignment], and the leftover floats are
+ * stored back into pending for the next call.
+ *
+ * If alignment == 0 or alignment == UNALIGNED, pending is not used.
+ */
+ static inline void write_4_samples_par_aligned( CSAMPLE * dest,
+ const v2df& src0,
+ const v2df& src1,
+ const unsigned short alignment,
+ v4sf& pending )
+ {
+ assert( (alignment == UNALIGNED) || (aligned_16(dest)) );
+ switch(alignment) {
+ case 0:
+ write_4_samples_aligned(dest, src0, src1);
+ break;
+ case 1:
+ pending.f[1] = (float)src0.d[0];
+ pending.f[2] = (float)src0.d[1];
+ pending.f[3] = (float)src1.d[0];
+ _mm_store_ps( dest, pending.v );
+ pending.f[0] = (float)src1.d[1];
+ break;
+ case 2: {
+ v4sf tmp0, tmp1;
+ tmp0.v = pending.v;
+ tmp1.v = _mm_cvtpd_ps( src0.v );
+ tmp0.v = _mm_movelh_ps( tmp0.v, tmp1.v );
+ _mm_store_ps( dest, tmp0.v );
+ tmp0.v = _mm_cvtpd_ps( src1.v );
+ _mm_store_ps( &pending.f[0], tmp0.v );
+ } break;
+ case 3:
+ pending.f[3] = (float)src0.d[0];
+ _mm_store_ps( dest, pending.v );
+ pending.f[0] = (float)src0.d[1];
+ pending.f[1] = (float)src1.d[0];
+ pending.f[2] = (float)src1.d[1];
+ break;
+ default:
+ (*dest++) = (float)src0.d[0];
+ (*dest++) = (float)src0.d[1];
+ (*dest++) = (float)src1.d[0];
+ (*dest++) = (float)src1.d[1];
+ }
+ }
+
+ /* This macro requires that these variables be defined in the
+ * local scope:
+ *
+ * const short Order;
+ * const __m128d GAIN;
+ * __m128d *xy, *XY;
+ * __m128d *CXY, *CXY_start;
+ * unsigned short k, _k_mask;
+ *
+ * Why a macro? Here are the other approaches:
+ *
+ * - COPY/PASTE: This yielded the fastest code, but
+ * makes this critical section harder to maintain.
+ *
+ * - FUNCTION: This has a significant performance
+ * penalty, so it's not an option.
+ *
+ * - INLINE FUNCTION: This actually has a significant
+ * performance penalty, too.
+ *
+ * - MACRO: While it's difficult to maintain a
+ * macro, it's better than COPY/PASTE. It's not
+ * quite as fast as COPY/PASTE, but it comes close.
+ *
+ * The MACRO was chosen as a compromise.
+ */
+ #define PROCESS_ONE_SAMPLE(in, out) \
+ { \
+ __m128d xmm0, xmm1, acc, x8; \
+ unsigned short iters; \
+ \
+ x8 = (in).v / GAIN; \
+ acc = x8; \
+ iters = 2*Order; \
+ xy = XY; \
+ /* Compute dot product */ \
+ while(iters--) { \
+ /* out.v += (*CXY).v * (*xy).v */ \
+ /* ++CXY; ++xy; */ \
+ xmm0 = _mm_load_pd((double*)CXY++); \
+ xmm1 = _mm_load_pd((double*)xy++); \
+ xmm0 = _mm_mul_pd(xmm0, xmm1); \
+ acc = _mm_add_pd(xmm0, acc); \
+ } \
+ _mm_store_pd( &(out).d[0], acc ); \
+ _mm_store_pd( (double*)(XY+k), x8 ); \
+ _mm_store_pd( (double*)(XY+k+Order), acc ); \
+ \
+ ++k; \
+ k &= _k_mask; \
+ if( k == 0 ) \
+ CXY = CXY_start; \
+ }
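+ // PROCESS_ONE_SAMPLE is used by the three inner_loop_*_4_stride()
+ // variants below and by the lead-in/tail code in process().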
+
+
+ /* Runs inner loop of FilterSSE<Order>::process(), optimized for
+ * `*dest_` and `src` both being 16-byte aligned.
+ *
+ * Returns the value of CXY for the next iteration.
+ */
+ template <short Order>
+ static inline __m128d* inner_loop_aligned_4_stride( CSAMPLE** dest_,
+ const CSAMPLE* src /* begin */,
+ const CSAMPLE* src_end,
+ unsigned short& k,
+ const unsigned short _k_mask,
+ const __m128d GAIN,
+ __m128d * const XY,
+ __m128d * CXY,
+ __m128d * const CXY_start,
+ __m128d * const CXY_end )
+ {
+ __m128d *xy;
+ CSAMPLE *dest = (*dest_);
+
+ assert( aligned_16(src) );
+ assert( aligned_16(dest) );
+ assert( aligned_16(src_end) );
+
+ for( ; src != src_end ; src += 4 )
+ {
+ v2df in0, in1, out0, out1;
+ read_4_samples_aligned(in0, in1, src);
+ PROCESS_ONE_SAMPLE(in0, out0);
+ PROCESS_ONE_SAMPLE(in1, out1);
+ write_4_samples_aligned(dest, out0, out1);
+ dest += 4;
+ }
+ (*dest_) = dest;
+ return CXY;
+ }
+
+ /* Runs inner loop of FilterSSE<Order>::process(), optimized for
+ * `*dest_` and `src` both being 4-byte aligned. However, if they
+ * are both 16-byte aligned, you should use
+ * inner_loop_aligned_4_stride().
+ *
+ * Returns the value of CXY for the next iteration.
+ */
+ template <short Order>
+ static inline __m128d* inner_loop_par_aligned_4_stride( CSAMPLE** dest_,
+ const CSAMPLE* src /* begin */,
+ const CSAMPLE* src_end,
+ unsigned short& k,
+ const unsigned short _k_mask,
+ const __m128d GAIN,
+ __m128d * const XY,
+ __m128d * CXY,
+ __m128d * const CXY_start,
+ __m128d * const CXY_end,
+ const unsigned short alignment_in,
+ const unsigned short alignment_out,
+ double& leftover_in,
+ v4sf& pending_ )
{
- if (order==8)
+ __m128d *xy;
+ v4sf pending = pending_;
+ CSAMPLE *dest = (*dest_);
+
+ assert( aligned_16(src) );
+ assert( (((intptr_t)dest) & 0x3) == 0 ); // 4-byte aligned
+ assert( aligned_16(src_end) );
+
+ /* first run */
{
- //8th order:
- // Channel 1
- xv1[0] = xv1[1]; xv1[1] = xv1[2]; xv1[2] = xv1[3]; xv1[3] = xv1[4];
- xv1[4] = xv1[5]; xv1[5] = xv1[6]; xv1[6] = xv1[7]; xv1[7] = xv1[8];
- xv1[8] = pIn[i]/GAIN;
- yv1[0] = yv1[1]; yv1[1] = yv1[2]; yv1[2] = yv1[3]; yv1[3] = yv1[4];
- yv1[4] = yv1[5]; yv1[5] = yv1[6]; yv1[6] = yv1[7]; yv1[7] = yv1[8];
- yv1[8] = (xv1[0] + xv1[8]) + coefs[1] * (xv1[1] + xv1[7]) +
- coefs[2] * (xv1[2] + xv1[6]) +
- coefs[3] * (xv1[3] + xv1[5]) + coefs[4] * xv1[4] +
- (coefs[5] * yv1[0]) + ( coefs[6] * yv1[1]) +
- (coefs[7] * yv1[2]) + ( coefs[8] * yv1[3]) +
- (coefs[9] * yv1[4]) + ( coefs[10] * yv1[5]) +
- (coefs[11] * yv1[6]) + ( coefs[12] * yv1[7]);
- Q_ASSERT(yv1[8]<100000 || yv1[8]>-100000);
- pOutput[i] = yv1[8];
-
- // Channel 2
- xv2[0] = xv2[1]; xv2[1] = xv2[2]; xv2[2] = xv2[3]; xv2[3] = xv2[4];
- xv2[4] = xv2[5]; xv2[5] = xv2[6]; xv2[6] = xv2[7]; xv2[7] = xv2[8];
- xv2[8] = pIn[i+1]/GAIN;
- yv2[0] = yv2[1]; yv2[1] = yv2[2]; yv2[2] = yv2[3]; yv2[3] = yv2[4];
- yv2[4] = yv2[5]; yv2[5] = yv2[6]; yv2[6] = yv2[7]; yv2[7] = yv2[8];
- yv2[8] = (xv2[0] + xv2[8]) + coefs[1] * (xv2[1] + xv2[7]) +
- coefs[2] * (xv2[2] + xv2[6]) +
- coefs[3] * (xv2[3] + xv2[5]) + coefs[4] * xv2[4] +
- (coefs[5] * yv2[0]) + ( coefs[6] * yv2[1]) +
- (coefs[7] * yv2[2]) + ( coefs[8] * yv2[3]) +
- (coefs[9] * yv2[4]) + ( coefs[10] * yv2[5]) +
- (coefs[11] * yv2[6]) + ( coefs[12] * yv2[7]);
- Q_ASSERT(yv2[8]<100000 || yv2[8]>-100000);
- pOutput[i+1] = yv2[8];
- }
- else if (order==2)
+ v2df in0, in1, out0, out1;
+ if( (alignment_in == 0) || (alignment_in == 2) ) {
+ read_4_samples_aligned(in0, in1, src);
+ } else {
+ read_4_samples_par_aligned(in0, in1, src, leftover_in);
+ }
+ PROCESS_ONE_SAMPLE(in0, out0);
+ PROCESS_ONE_SAMPLE(in1, out1);
+ write_4_samples_par_aligned_init(dest, out0, out1, alignment_out, pending);
+ dest += (4 - alignment_out);
+ assert( aligned_16(dest) );
+ src += 4;
+ }
+ for( ; src != src_end ; src += 4 )
{
- // Second order
- xv1[0] = xv1[1]; xv1[1] = xv1[2];
- xv1[2] = pIn[i] / GAIN;
- yv1[0] = yv1[1]; yv1[1] = yv1[2];
- yv1[2] = (xv1[0] + xv1[2]) + coefs[1] * xv1[1] + ( coefs[2] * yv1[0]) + (coefs[3] * yv1[1]);
- pOutput[i] = yv1[2];
-
- xv2[0] = xv2[1]; xv2[1] = xv2[2];
- xv2[2] = pIn[i+1] / GAIN;
- yv2[0] = yv2[1]; yv2[1] = yv2[2];
- yv2[2] = (xv2[0] + xv2[2]) + coefs[1] * xv2[1] + ( coefs[2] * yv2[0]) + (coefs[3] * yv2[1]);
- pOutput[i+1] = yv2[2];
- }
- else
+ v2df in0, in1, out0, out1;
+ if( (alignment_in == 0) || (alignment_in == 2) ) {
+ read_4_samples_aligned(in0, in1, src);
+ } else {
+ read_4_samples_par_aligned(in0, in1, src, leftover_in);
+ }
+ PROCESS_ONE_SAMPLE(in0, out0);
+ PROCESS_ONE_SAMPLE(in1, out1);
+ write_4_samples_par_aligned(dest, out0, out1, alignment_out, pending);
+ dest += 4;
+ }
+ (*dest_) = dest;
+ pending_ = pending;
+ return CXY;
+ }
+
+ /* Runs inner loop of FilterSSE<Order>::process(), assuming no
+ * useful alignment on `*dest_` or `src` (i.e. only 1-byte aligned).
+ *
+ * Returns the value of CXY for the next iteration.
+ */
+ template <short Order>
+ static inline __m128d* inner_loop_lame_4_stride( CSAMPLE** dest_,
+ const CSAMPLE* src /* begin */,
+ const CSAMPLE* src_end,
+ unsigned short& k,
+ const unsigned short _k_mask,
+ const __m128d GAIN,
+ __m128d * const XY,
+ __m128d * CXY,
+ __m128d * const CXY_start,
+ __m128d * const CXY_end )
+ {
+ __m128d *xy;
+ CSAMPLE *dest = (*dest_);
+ v4sf pending = {{0, 0, 0, 0}};
+
+ for( ; src != src_end ; src += 4 )
{
- // Fourth order
- xv1[0] = xv1[1]; xv1[1] = xv1[2]; xv1[2] = xv1[3]; xv1[3] = xv1[4];
- xv1[4] = pIn[i] / GAIN;
- yv1[0] = yv1[1]; yv1[1] = yv1[2]; yv1[2] = yv1[3]; yv1[3] = yv1[4];
- yv1[4] = (xv1[0] + xv1[4]) + coefs[1]*(xv1[1]+xv1[3]) + coefs[2] * xv1[2]
- + ( coefs[3] * yv1[0]) + ( coefs[4] * yv1[1])
- + ( coefs[5] * yv1[2]) + ( coefs[6] * yv1[3]);
- pOutput[i] = yv1[4];
+ v2df in0, in1, out0, out1;
- xv2[0] = xv2[1]; xv2[1] = xv2[2]; xv2[2] = xv2[3]; xv2[3] = xv2[4];
- xv2[4] = pIn[i+1] / GAIN;
- yv2[0] = yv2[1]; yv2[1] = yv2[2]; yv2[2] = yv2[3]; yv2[3] = yv2[4];
- yv2[4] = (xv2[0] + xv2[4]) + coefs[1]*(xv2[1]+xv2[3]) + coefs[2] * xv2[2]
- + ( coefs[3] * yv2[0]) + ( coefs[4] * yv2[1])
- + ( coefs[5] * yv2[2]) + ( coefs[6] * yv2[3]);
- pOutput[i+1] = yv2[4];
+ read_4_samples_lame(in0, in1, src);
+ PROCESS_ONE_SAMPLE(in0, out0);
+ PROCESS_ONE_SAMPLE(in1, out1);
+ write_4_samples_par_aligned(dest, out0, out1, UNALIGNED, pending);
+ dest += 4;
}
+
+ (*dest_) = dest;
+ return CXY;
}
-// Check for denormals
- for (i=0; i<=order; ++i)
+ /* DetailsEngineFilterIIR::FilterSSE<Order>::process()
+ *
+ * This is an SSE-optimized version of the IIR filter. It does
+ * the same calculations as FilterReference, but is organized to
+ * make it easy for the compiler to use SIMD instructions. If you
+ * look at FilterReference::process(), it's pretty clear what
+ * we're doing:
+ *
+ * - There is a rolling buffer of 9 'xv' values. The
+ * current input sample is divided by the gain and
+ * appended to the end of the buffer.
+ *
+ * - There is a rolling buffer of 9 'yv' values, where
+ * yv[8] = f(xv[0..8], yv[0..7], coefs[1..12]).
+ *
+ * - Both LEFT and RIGHT channels are calculated independently,
+ * but in parallel.
+ *
+ * - Math uses double-precision floats because the
+ * filter becomes unstable using single-precision math.
+ *
+ * The primary optimizations for this implementation are:
+ *
+ * - Calculate the Left and Right channels at the same time,
+ * using packed-double SIMD floating point ops.
+ *
+ * - Eliminate the memory shuffle (xv1[0]=xv1[1]; xv1[1]=...)
+ * by using a ring-buffer.
+ *
+ * - To facilitate the ring-buffer, the coefficients are
+ * pre-shuffled into a matrix, and the correct row
+ * selected depending on the current position of the
+ * ring buffers.
+ *
+ * - The xv and yv ring buffers are adjacent in memory.
+ *
+ * - Since xv and yv are adjacent in memory, their
+ * coefficients are also adjacent in the form of
+ * a partitioned matrix.
+ *
+ * In the case of an 8th order filter, the calculations on each
+ * run are like this (in pseudocode):
+ *
+ * unsigned i; // sample position
+ * xv[0..7] = xv[1..8];
+ * xv[8] = in[i]/coefs[0];
+ * yv[0..7] = yv[1..8];
+ * yv[8] = xv[0] + coefs[1] * xv[1] + coefs[2] * xv[2] + coefs[3] * xv[3] + coefs[4] * xv[4]
+ * + coefs[3] * xv[5] + coefs[2] * xv[6] + coefs[1] * xv[7] + xv[8]
+ * + coefs[5] * yv[0] + coefs[6] * yv[1] + coefs[7] * yv[2] + coefs[8] * yv[3]
+ * + coefs[9] * yv[4] + coefs[10] * yv[5] + coefs[11] * yv[6] + coefs[12] * yv[7];
+ * out[i] = yv[8];
+ *
+ * If you remove xv[8] and yv[8], you see that we can rearrange
+ * the calculation into vector form:
+ *
+ * The rolling buffer:
+ *
+ * XY = [ xv[0] xv[1] xv[2] ... xv[7] yv[0] yv[1] ... yv[7] ]
+ *
+ * The coefficients (using c[] instead of coefs[]):
+ *
+ * CXY = [ 1.0 c[1] c[2] c[3] c[4] c[3] c[2] c[1] c[5] c[6] ... c[12] ]
+ *
+ * Now the calculations take a form like this:
+ *
+ * float *in, *out;
+ * xtmp = last_xtmp; ytmp = last_ytmp;
+ * for( i=0 ; i<=N ; ++i ) {
+ * shuffle(XY, xtmp, ytmp); // uses last iteration's values
+ * xtmp = in[i] / coefs[0];
+ * ytmp = xtmp + dot_product(XY, CXY);
+ * out[i] = ytmp
+ * }
+ *
+ * Notice that the shuffle is pre-initialized from last time. So,
+ * as long as things are initialized... we can move the shuffle to
+ * the /end/ of the loop and get the same results. Like this:
+ *
+ * float *in, *out;
+ * int i;
+ * if(first_run) { xtmp = 0.0; ytmp = 0.0 }
+ * for( i=0 ; i<=N ; ++i ) {
+ * xtmp = in[i] / coefs[0];
+ * ytmp = xtmp + dot_product(XY, CXY);
+ * out[i] = ytmp
+ * shuffle(XY, xtmp, ytmp);
+ * }
+ *
+ * The best way to optimize the shuffle is to *not* shuffle.
+ * Since the number of entries in each rolling buffer is the
+ * same as the order... and always a power of 2... a ring
+ * buffer is a good choice. So, we change XY into two partitioned
+ * ring buffers[1] (using x and y instead of xv and yv):
+ *
+ * int k; // Ring buffer position
+ * if(k>7) k = 0;
+ * if(k==0) XY = [ x[0] x[1] x[2] x[3] x[4] x[5] x[6] x[7] y[0] y[1] y[2] y[3] y[4] y[5] y[6] y[7] ]
+ * if(k==1) XY = [ x[7] x[0] x[1] x[2] x[3] x[4] x[5] x[6] y[7] y[0] y[1] y[2] y[3] y[4] y[5] y[6] ]
+ * if(k==2) XY = [ x[6] x[7] x[0] x[1] x[2] x[3] x[4] x[5] y[6] y[7] y[0] y[1] y[2] y[3] y[4] y[5] ]
+ * ...
+ * if(k==7) XY = [ x[1] x[2] x[3] x[4] x[5] x[6] x[7] x[0] y[1] y[2] y[3] y[4] y[5] y[6] y[7] y[0] ]
+ *
+ * But when we do this, we can no longer use the dot product.
+ * In order to keep using the dot product, we pre-shuffle the
+ * coefficients to match the current position in the ring buffer.
+ *
+ * CXY = [ [ 1 c[1] c[2] c[3] c[4] c[3] c[2] c[1] c[5] c[6] c[7] c[8] c[9] c[10] c[11] c[12] ]
+ * [ c[1] 1 c[1] c[2] c[3] c[4] c[3] c[2] c[12] c[5] c[6] c[7] c[8] c[9] c[10] c[11] ]
+ * [ c[2] c[1] 1 c[1] c[2] c[3] c[4] c[3] c[11] c[12] c[5] c[6] c[7] c[8] c[9] c[10] ]
+ * ...
+ * [ c[1] c[2] c[3] c[4] c[3] c[2] c[1] 1 c[6] c[7] c[8] c[9] c[10] c[11] c[12] c[5] ]
+ *
+ * Note that there are 8 rows. This changes our calculation to this:
+ *
+ * float *in, *out;
+ * int i, k;
+ * if(first_run) { xtmp = 0.0; ytmp = 0.0; }
+ * k = k_from_last_run;
+ * for( i=0 ; i<=N ; ++i ) {
+ * xtmp = in[i] / coefs[0];
+ * ytmp = xtmp + dot_product(XY, CXY[k]);
+ * out[i] = ytmp
+ * XY[k] = xtmp; XY[8+k] = ytmp;
+ * ++k;
+ * k &= 7; // see [2]
+ * }
+ *
+ * At this point, all we've done is optimize the shuffle. To get
+ * parallel calculations, we make each x[] and y[] a VECTOR of
+ * size 2 (Left and Right). That's actually done with the union
+ * v2df (defined above). So, XY and CXY are arrays of v2df
+ * instead of double.
+ *
+ * FINALLY, SSE optimizations on x86 hardware require that the
+ * memory addresses used be 16-byte aligned.[3][4] The next-best
+ * thing is to work with 4-byte alignments. However, with all the
+ * APIs that Mixxx uses, we have no guarantees about alignment.
+ * This routine is optimized for 16-byte alignment, gives good
+ * performance with 4-byte alignment, and still works with 1-byte
+ * alignment. To handle this, this function has 3 sections: the
+ * lead-in (to get to the next aligned input pointer), the inner
+ * loop (aligned strides over the length of the buffers), and the
+ * tail (taking care of unaligned data at the end of the buffers).
+ *
+ * [1] http://en.wikipedia.org/wiki/Circular_buffer
+ * [2] For buffers that have a size that is a power of 2
+ * (2, 4, 8, 16, 32, ...), these are equivalent:
+ *
+ * if( k >= N ) k = 0;
+ * k &= (N-1);
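+ *
+ * (e.g. N==8, k==7: ++k gives 8, and 8 & 7 == 0.)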
+ *
+ * [3] That is, ((intptr_t)ptr % 16) == 0
+ * [4] Actually, you can do SSE on unaligned data... but the
+ * performance penalty is usually large enough to defeat
+ * the purpose.
+ */
+ template< short Order >
+ void FilterSSE<Order>::process(const CSAMPLE * __restrict pIn,
+ const CSAMPLE * __restrict pOut,
+ const int iBufferSize) __restrict
{
- xv1[i] = zap_denormal(xv1[i]);
- yv1[i] = zap_denormal(yv1[i]);
- xv2[i] = zap_denormal(xv2[i]);
- yv2[i] = zap_denormal(yv2[i]);
+ CSAMPLE * pOutput = (CSAMPLE *)pOut;
+ const CSAMPLE * pInput = pIn;
+ int i;
+ unsigned short alignment_in;
+ unsigned short alignment_out;
+ double leftover_in = 0; // used when alignment_in is odd
+
+ // These variables are for local copies of the object's
+ // current state.
+ unsigned short k; // ring buffer position
+ __m128d *CXY_start, *CXY_end;
+ __m128d * __restrict CXY;
+ __m128d XY[2*ORDER], *xy;
+ __m128d GAIN = _gain->v;
+
+ // Assuming that data is complete, interleaved (L,R) pairs
+ assert( (iBufferSize % 2) == 0 );
+ // Assuming that the buffers do not overlap. (__restrict)
+ assert( (pOut >= pInput + iBufferSize) || (pInput >= pOut + iBufferSize) );
+
+ /* POP THE CURRENT OBJECT STATE
+ *
+ * The state of the OBJECT is popped into local variables.
+ * This increases speed and reduces memory/cache slowdowns.
+ *
+ * xy: The ringbuffers are copied to the stack. This
+ * increases access speed and reduces cache misses.
+ *
+ * k: This needs to be a register variable, but if we
+ * use _k, then it will periodically save this->_k
+ * in memory whenever it changes. This is a big
+ * slow-down.
+ *
+ * CXY: Because _CXY is a pointer to a pointer, dereferencing
+ * _CXY[k] is actually very slow.
+ */
+ memcpy(XY, _xv, 2*ORDER*2*sizeof(double));
+ k = _k;
+ xy = XY;
+ assert(k < ORDER);
+ CXY_start = (__m128d*)_CXY[0];
+ CXY_end = (__m128d*)(_CXY[ORDER-1] + (2*ORDER));
+ CXY = (__m128d*)_CXY[k];
+
+ /* LEAD-IN AND ALIGNMENT CALCULATIONS
+ */
+ const CSAMPLE *pInput_end;
+ const int STRIDE = 4;
+
+ alignment_in = ((intptr_t) pInput) & 0xF;
+
+ if( 0 == (alignment_in % 4) ) {
+ alignment_in /= 4;
+ // lead_in = samples handled before the aligned inner loop begins.
+ // (Dividing out lead_in, not alignment_in, keeps pInput_end
+ // inside the buffer for odd alignments.)
+ const int lead_in = (4 - alignment_in) & 0x3;
+ pInput_end = &pInput[ lead_in + STRIDE * ((iBufferSize - lead_in)/STRIDE) ];
+ assert( (iBufferSize - (intptr_t)(pInput_end - pInput)) < 4 );
+ } else {
+ alignment_in = UNALIGNED;
+ // Stop the (stride-4) lame loop on a multiple of 4; the tail
+ // handles any remaining pair of samples.
+ pInput_end = pInput + STRIDE * (iBufferSize / STRIDE);
+ }
+
+ // If output isn't 4-byte aligned... it will stay unaligned.
+ // So there's no point in doing aligned stuff.
+ if( 0 != (((intptr_t)pOutput) & 0x3) ) {
+ alignment_in = UNALIGNED;
+ alignment_out = UNALIGNED;
+ pInput_end = pInput + STRIDE * (iBufferSize / STRIDE);
+ }
+
+ // Run calculations once for all the unaligned samples
+ // at the beginning.
+ if(alignment_in == 3) {
+ leftover_in = *pInput++;
+ }
+
+ if((alignment_in == 1) || (alignment_in == 2)) {
+ v2df in, out;
+ unsigned short tmp;
+
+ in.d[0] = *pInput++;
+ in.d[1] = *pInput++;
+ if( alignment_in == 1 ) {
+ leftover_in = *pInput++;
+ }
+
+ PROCESS_ONE_SAMPLE(in,out);
+ assert( CXY < CXY_end );
+
+ (*pOutput++) = (float)out.d[0];
+ (*pOutput++) = (float)out.d[1];
+ }
+
+ alignment_out = ((intptr_t) pOutput) & 0xF;
+ if( alignment_in == UNALIGNED ) {
+ // The lame inner loop writes every sample directly, so the
+ // tail must not expect leading samples parked in `pending`.
+ alignment_out = UNALIGNED;
+ } else if( 0 == (alignment_out % 4) ) {
+ alignment_out /= 4;
+ } else {
+ alignment_out = UNALIGNED;
+ }
+
+ /* INNER LOOP
+ */
+ v4sf pending;
+ if( (alignment_in != UNALIGNED) && (alignment_out != UNALIGNED) ) {
+ if( ((alignment_in & 1) == 0) && (alignment_out == 0) ) {
+ CXY = inner_loop_aligned_4_stride<Order>( &pOutput,
+ pInput,
+ pInput_end,
+ k,
+ _k_mask,
+ GAIN,
+ XY,
+ CXY,
+ CXY_start,
+ CXY_end );
+ } else {
+ CXY = inner_loop_par_aligned_4_stride<Order>( &pOutput,
+ pInput,
+ pInput_end,
+ k,
+ _k_mask,
+ GAIN,
+ XY,
+ CXY,
+ CXY_start,
+ CXY_end,
+ alignment_in,
+ alignment_out,
+ leftover_in,
+ pending );
+ }
+ } else {
+ CXY = inner_loop_lame_4_stride<Order>( &pOutput,
+ pInput,
+ pInput_end,
+ k,
+ _k_mask,
+ GAIN,
+ XY,
+ CXY,
+ CXY_start,
+ CXY_end );
+ }
+ pInput = pInput_end;
+
+ /* TAIL
+ *
+ * Run calculations for all unaligned samples at the end.
+ */
+ int leftover = iBufferSize - ((const CSAMPLE*)pInput - (const CSAMPLE*)pIn);
+ while( leftover > 0 ) {
+
+ v2df in, out;
+
+ switch(leftover) {
+ case 2:
+ in.d[0] = (double)(*pInput++);
+ in.d[1] = (double)(*pInput++);
+ break;
+ case 1:
+ in.d[0] = leftover_in;
+ in.d[1] = (double)(*pInput++);
+ break;
+ case 3:
+ in.d[0] = leftover_in;
+ in.d[1] = (double)(*pInput++);
+ leftover_in = (double)(*pInput++);
+ break;
+ default:
+ assert(false);
+ }
+ leftover -= 2;
+
+ PROCESS_ONE_SAMPLE(in, out);
+ assert( CXY < CXY_end );
+
+ switch(alignment_out) {
+ case 0:
+ case UNALIGNED:
+ (*pOutput++) = (float)out.d[0];
+ (*pOutput++) = (float)out.d[1];
+ break;
+ case 1:
+ (*pOutput++) = pending.f[0];
+ (*pOutput++) = (float)out.d[0];
+ pending.f[0] = (float)out.d[1];
+ break;
+ case 2:
+ (*pOutput++) = pending.f[0];
+ (*pOutput++) = pending.f[1];
+ pending.f[0] = (float)out.d[0];
+ pending.f[1] = (float)out.d[1];
+ break;
+ case 3:
+ (*pOutput++) = pending.f[0];
+ (*pOutput++) = pending.f[1];
+ (*pOutput++) = pending.f[2];
+ pending.f[0] = (float)out.d[0];
+ pending.f[1] = (float)out.d[1];
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ if(alignment_out == UNALIGNED) alignment_out = 0;
+ for( int i=0 ; i < alignment_out ; ++i ) {
+ (*pOutput++) = pending.f[i];
+ }
+ assert( (intptr_t)pInput == (intptr_t)(pIn + iBufferSize) );
+ assert( (intptr_t)pOutput == (intptr_t)(pOut + iBufferSize) );
+
+ /* PUSH BACK THE CURRENT OBJECT STATE
+ */
+ _k = k;
+ memcpy(_xv, XY, 2*ORDER*sizeof(__m128d));
+
+ // Check for denormals (_xv and _yv each hold ORDER entries)
+ for (i=0; i<ORDER; ++i)
+ {
+ _xv[i].d[0] = zap_denormal(_xv[i].d[0]);
+ _xv[i].d[1] = zap_denormal(_xv[i].d[1]);
+ _yv[i].d[0] = zap_denormal(_yv[i].d[0]);
+ _yv[i].d[1] = zap_denormal(_yv[i].d[1]);
+ }
+ }
+#endif // IIR_ENABLE_SSE3
+
+ FilterReference::FilterReference(int iOrder, const double* pCoefs)
+ {
+ order = iOrder;
+ coefs = pCoefs;
+
+ // Reset the yv's:
+ memset(yv1, 0, sizeof(yv1));
+ memset(yv2, 0, sizeof(yv2));
+ memset(xv1, 0, sizeof(xv1));
+ memset(xv2, 0, sizeof(xv2));
+ }
+
+ FilterReference::~FilterReference()
+ {
+ }
+
+ void FilterReference::process(const CSAMPLE * pIn, const CSAMPLE * pOut, const int iBufferSize)
+ {
+ CSAMPLE * pOutput = (CSAMPLE *)pOut;
+ double GAIN = coefs[0];
+ int i;
+ for (i=0; i<iBufferSize; i+=2)
+ {
+ if (order==8)
+ {
+ //8th order:
+ // Channel 1
+ xv1[0] = xv1[1]; xv1[1] = xv1[2]; xv1[2] = xv1[3]; xv1[3] = xv1[4];
+ xv1[4] = xv1[5]; xv1[5] = xv1[6]; xv1[6] = xv1[7]; xv1[7] = xv1[8];
+ xv1[8] = pIn[i]/GAIN;
+ yv1[0] = yv1[1]; yv1[1] = yv1[2]; yv1[2] = yv1[3]; yv1[3] = yv1[4];
+ yv1[4] = yv1[5]; yv1[5] = yv1[6]; yv1[6] = yv1[7]; yv1[7] = yv1[8];
+ yv1[8] = (xv1[0] + xv1[8]) + coefs[1] * (xv1[1] + xv1[7]) +
+ coefs[2] * (xv1[2] + xv1[6]) +
+ coefs[3] * (xv1[3] + xv1[5]) + coefs[4] * xv1[4] +
+ (coefs[5] * yv1[0]) + ( coefs[6] * yv1[1]) +
+ (coefs[7] * yv1[2]) + ( coefs[8] * yv1[3]) +
+ (coefs[9] * yv1[4]) + ( coefs[10] * yv1[5]) +
+ (coefs[11] * yv1[6]) + ( coefs[12] * yv1[7]);
+ assert(yv1[8]<100000 && yv1[8]>-100000); // i.e. the filter hasn't blown up
+ pOutput[i] = yv1[8];
+
+ // Channel 2
+ xv2[0] = xv2[1]; xv2[1] = xv2[2]; xv2[2] = xv2[3]; xv2[3] = xv2[4];
+ xv2[4] = xv2[5]; xv2[5] = xv2[6]; xv2[6] = xv2[7]; xv2[7] = xv2[8];
+ xv2[8] = pIn[i+1]/GAIN;
+ yv2[0] = yv2[1]; yv2[1] = yv2[2]; yv2[2] = yv2[3]; yv2[3] = yv2[4];
+ yv2[4] = yv2[5]; yv2[5] = yv2[6]; yv2[6] = yv2[7]; yv2[7] = yv2[8];
+ yv2[8] = (xv2[0] + xv2[8]) + coefs[1] * (xv2[1] + xv2[7]) +
+ coefs[2] * (xv2[2] + xv2[6]) +
+ coefs[3] * (xv2[3] + xv2[5]) + coefs[4] * xv2[4] +
+ (coefs[5] * yv2[0]) + ( coefs[6] * yv2[1]) +
+ (coefs[7] * yv2[2]) + ( coefs[8] * yv2[3]) +
+ (coefs[9] * yv2[4]) + ( coefs[10] * yv2[5]) +
+ (coefs[11] * yv2[6]) + ( coefs[12] * yv2[7]);
+ assert(yv2[8]<100000 && yv2[8]>-100000); // i.e. the filter hasn't blown up
+ pOutput[i+1] = yv2[8];
+ }
+ else if (order==2)
+ {
+ // Second order
+ xv1[0] = xv1[1]; xv1[1] = xv1[2];
+ xv1[2] = pIn[i] / GAIN;
+ yv1[0] = yv1[1]; yv1[1] = yv1[2];
+ yv1[2] = (xv1[0] + xv1[2]) + coefs[1] * xv1[1] + ( coefs[2] * yv1[0]) + (coefs[3] * yv1[1]);
+ pOutput[i] = yv1[2];
+
+ xv2[0] = xv2[1]; xv2[1] = xv2[2];
+ xv2[2] = pIn[i+1] / GAIN;
+ yv2[0] = yv2[1]; yv2[1] = yv2[2];
+ yv2[2] = (xv2[0] + xv2[2]) + coefs[1] * xv2[1] + ( coefs[2] * yv2[0]) + (coefs[3] * yv2[1]);
+ pOutput[i+1] = yv2[2];
+ }
+ else
+ {
+ // Fourth order
+ xv1[0] = xv1[1]; xv1[1] = xv1[2]; xv1[2] = xv1[3]; xv1[3] = xv1[4];
+ xv1[4] = pIn[i] / GAIN;
+ yv1[0] = yv1[1]; yv1[1] = yv1[2]; yv1[2] = yv1[3]; yv1[3] = yv1[4];
+ yv1[4] = (xv1[0] + xv1[4]) + coefs[1]*(xv1[1]+xv1[3]) + coefs[2] * xv1[2]
+ + ( coefs[3] * yv1[0]) + ( coefs[4] * yv1[1])
+ + ( coefs[5] * yv1[2]) + ( coefs[6] * yv1[3]);
+ pOutput[i] = yv1[4];
+
+ xv2[0] = xv2[1]; xv2[1] = xv2[2]; xv2[2] = xv2[3]; xv2[3] = xv2[4];
+ xv2[4] = pIn[i+1] / GAIN;
+ yv2[0] = yv2[1]; yv2[1] = yv2[2]; yv2[2] = yv2[3]; yv2[3] = yv2[4];
+ yv2[4] = (xv2[0] + xv2[4]) + coefs[1]*(xv2[1]+xv2[3]) + coefs[2] * xv2[2]
+ + ( coefs[3] * yv2[0]) + ( coefs[4] * yv2[1])
+ + ( coefs[5] * yv2[2]) + ( coefs[6] * yv2[3]);
+ pOutput[i+1] = yv2[4];
+ }
+ }
+
+ // Check for denormals
+ for (i=0; i<=order; ++i)
+ {
+ xv1[i] = zap_denormal(xv1[i]);
+ yv1[i] = zap_denormal(yv1[i]);
+ xv2[i] = zap_denormal(xv2[i]);
+ yv2[i] = zap_denormal(yv2[i]);
+ }
}
-}
+} // namespace DetailsEngineFilterIIR
diff --git a/src/engine/enginefilteriir.h b/src/engine/enginefilteriir.h
index 3c93553..73362fe 100644
--- a/src/engine/enginefilteriir.h
+++ b/src/engine/enginefilteriir.h
@@ -24,15 +24,10 @@ class EngineFilterIIR : public EngineObject
{
public:
EngineFilterIIR(const double *pCoefs, int iOrder);
- ~EngineFilterIIR();
+ virtual ~EngineFilterIIR();
void process(const CSAMPLE *pIn, const CSAMPLE *pOut, const int iBufferSize);
protected:
- int order;
- const double *coefs;
- #define MAXNZEROS 8
- #define MAXNPOLES 8
- double xv1[MAXNZEROS+1], yv1[MAXNPOLES+1];
- double xv2[MAXNZEROS+1], yv2[MAXNPOLES+1];
+ EngineObject * _d;
};
//