How would one do something like this without intrinsics (the code is C++ using 
GCC vector extensions):

template <class V>
struct Fft 
{
  typedef typename V::T T;
  typedef typename V::vec vec;
  static const int VecSize = V::Size;

...

  template <int Interleaved>
  static NOINLINE void fft_pass_interleaved(
    vec * __restrict pr, 
    vec *__restrict pi, 
    vec *__restrict pend, 
    T *__restrict table)  
  {
    for(; pr < pend; pr += 2, pi += 2, table += 2*Interleaved)
    {
      vec tmpr, ti, ur, ui, wr, wi;
      // Load twiddle factors: split the (re, im) pairs in table into
      // a vector of real parts and a vector of imaginary parts.
      V::template expandComplexArrayToRealImagVec<Interleaved>(table, wr, wi);
      // Split the interleaved data into the two butterfly inputs u and t.
      V::template deinterleave<Interleaved>(pr[0], pr[1], ur, tmpr);
      V::template deinterleave<Interleaved>(pi[0], pi[1], ui, ti);
      // Complex multiply t by the twiddle factor w.
      vec tr = tmpr*wr - ti*wi;
      ti = tmpr*wi + ti*wr;
      // Write back the butterfly outputs u + t*w and u - t*w, re-interleaved.
      V::template interleave<Interleaved>(ur + tr, ur - tr, pr[0], pr[1]);
      V::template interleave<Interleaved>(ui + ti, ui - ti, pi[0], pi[1]);
    }
  }

...
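(For context: each iteration computes one vector's worth of radix-2 
butterflies. With twiddle factor w = wr + i*wi, u = ur + i*ui and 
t = tmpr + i*ti, the two multiply lines form the product t*w, and the 
interleave calls write back u + t*w and u - t*w.)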

Here, vector elements need to be shuffled around when they are loaded and 
stored. This is platform-dependent and cannot be expressed through vector 
operations (or GCC vector extensions), so I abstracted the platform-dependent 
functionality into member functions of V, which are implemented using 
intrinsics.
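To give an idea of what V looks like, here is a simplified sketch of the SSE 
single-precision members (the exact signatures are abbreviated, and only the 
shuffle pattern used for Interleaved=4 is shown):

#include <xmmintrin.h>

struct SSEVecF   // sketch of SSEVec<float>
{
  typedef float T;
  typedef __m128 vec;   // with GCC this is a vector-extension type
  static const int Size = 4;

  // Split (a0 a1 a2 a3), (a4 a5 a6 a7) into even elements
  // (a0 a2 a4 a6) and odd elements (a1 a3 a5 a7).
  template <int Interleaved>
  static void deinterleave(vec a, vec b, vec &even, vec &odd)
  {
    even = _mm_shuffle_ps(a, b, 0x88);   // shufps $0x88
    odd  = _mm_shuffle_ps(a, b, 0xdd);   // shufps $0xdd
  }

  // Inverse of deinterleave.
  template <int Interleaved>
  static void interleave(vec even, vec odd, vec &a, vec &b)
  {
    a = _mm_unpacklo_ps(even, odd);      // unpcklps
    b = _mm_unpackhi_ps(even, odd);      // unpckhps
  }

  // Load 2*Interleaved floats of (re, im) pairs and split them into
  // a vector of real parts and a vector of imaginary parts.
  template <int Interleaved>
  static void expandComplexArrayToRealImagVec(const T *p, vec &re, vec &im)
  {
    vec lo = _mm_load_ps(p);
    vec hi = _mm_load_ps(p + 4);
    re = _mm_shuffle_ps(lo, hi, 0x88);
    im = _mm_shuffle_ps(lo, hi, 0xdd);
  }
};

Since GCC inlines all of this, each call compiles to one or two 
register-to-register shuffles.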
The assembly generated for SSE single precision and Interleaved=4 is:

0000000000000000 <_ZN3FftI6SSEVecIfEE20fft_pass_interleavedILi4EEEvPDv4_fS5_S5_Pf>:
   0:   48 39 d7                cmp    %rdx,%rdi
   3:   0f 83 9c 00 00 00       jae    a5 <_ZN3FftI6SSEVecIfEE20fft_pass_interleavedILi4EEEvPDv4_fS5_S5_Pf+0xa5>
   9:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)
  10:   0f 28 19                movaps (%rcx),%xmm3
  13:   0f 28 41 10             movaps 0x10(%rcx),%xmm0
  17:   48 83 c1 20             add    $0x20,%rcx
  1b:   0f 28 f3                movaps %xmm3,%xmm6
  1e:   0f 28 2f                movaps (%rdi),%xmm5
  21:   0f c6 d8 dd             shufps $0xdd,%xmm0,%xmm3
  25:   0f c6 f0 88             shufps $0x88,%xmm0,%xmm6
  29:   0f 28 e5                movaps %xmm5,%xmm4
  2c:   0f 28 47 10             movaps 0x10(%rdi),%xmm0
  30:   0f 28 4e 10             movaps 0x10(%rsi),%xmm1
  34:   0f c6 e0 88             shufps $0x88,%xmm0,%xmm4
  38:   0f c6 e8 dd             shufps $0xdd,%xmm0,%xmm5
  3c:   0f 28 06                movaps (%rsi),%xmm0
  3f:   0f 28 d0                movaps %xmm0,%xmm2
  42:   0f c6 c1 dd             shufps $0xdd,%xmm1,%xmm0
  46:   0f c6 d1 88             shufps $0x88,%xmm1,%xmm2
  4a:   0f 28 cd                movaps %xmm5,%xmm1
  4d:   0f 28 f8                movaps %xmm0,%xmm7
  50:   0f 59 ce                mulps  %xmm6,%xmm1
  53:   0f 59 fb                mulps  %xmm3,%xmm7
  56:   0f 59 c6                mulps  %xmm6,%xmm0
  59:   0f 59 dd                mulps  %xmm5,%xmm3
  5c:   0f 5c cf                subps  %xmm7,%xmm1
  5f:   0f 58 c3                addps  %xmm3,%xmm0
  62:   0f 28 dc                movaps %xmm4,%xmm3
  65:   0f 5c d9                subps  %xmm1,%xmm3
  68:   0f 58 cc                addps  %xmm4,%xmm1
  6b:   0f 28 e1                movaps %xmm1,%xmm4
  6e:   0f 15 cb                unpckhps %xmm3,%xmm1
  71:   0f 14 e3                unpcklps %xmm3,%xmm4
  74:   0f 29 4f 10             movaps %xmm1,0x10(%rdi)
  78:   0f 28 ca                movaps %xmm2,%xmm1
  7b:   0f 29 27                movaps %xmm4,(%rdi)
  7e:   0f 5c c8                subps  %xmm0,%xmm1
  81:   48 83 c7 20             add    $0x20,%rdi
  85:   0f 58 c2                addps  %xmm2,%xmm0
  88:   0f 28 d0                movaps %xmm0,%xmm2
  8b:   0f 15 c1                unpckhps %xmm1,%xmm0
  8e:   0f 14 d1                unpcklps %xmm1,%xmm2
  91:   0f 29 46 10             movaps %xmm0,0x10(%rsi)
  95:   0f 29 16                movaps %xmm2,(%rsi)
  98:   48 83 c6 20             add    $0x20,%rsi
  9c:   48 39 fa                cmp    %rdi,%rdx
  9f:   0f 87 6b ff ff ff       ja     10 <_ZN3FftI6SSEVecIfEE20fft_pass_interleavedILi4EEEvPDv4_fS5_S5_Pf+0x10>
  a5:   f3 c3                   repz retq 
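Note that apart from the movaps loads and stores of pr, pi and the twiddle 
table, everything stays in registers: each shuffle abstraction boiled down to 
a single shufps or unpckps instruction, with no spills at the boundaries of 
the intrinsic calls.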

Would something like that be possible with D inline assembly, or would there 
be additional loads and stores for each call of V::interleave, V::deinterleave 
and V::expandComplexArrayToRealImagVec?
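(For reference, with GCC one can also write individual operations as extended 
inline asm and still leave register allocation to the compiler, so nothing is 
forced through memory at the asm boundary. A minimal sketch, with a 
hypothetical helper name:

typedef float vec4f __attribute__((vector_size(16)));

// Hypothetical stand-alone helper, not from the code above.  The
// "x"/"+x" constraints request SSE registers, so a and b can stay
// in registers across the asm statement.
static inline vec4f shuffle_even(vec4f a, vec4f b)
{
  asm("shufps $0x88, %1, %0" : "+x"(a) : "x"(b));
  return a;
}

I don't think D's inline assembler has operand constraints like these, hence 
my worry about extra loads and stores.)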
