While porting some code from C to D, I found the inline assembler very convenient. This is the C code (which uses an external NASM file):

        // dot_product returns dot product t*w of n elements.  n is rounded
        // up to a multiple of 8.  Result is scaled down by 8 bits.
        #ifdef NOASM  // no assembly language
        // Portable reference implementation.
        //   t, w : arrays of 16-bit values; both must be readable up to n
        //          rounded up to the next multiple of 8 elements.
        //   n    : element count; rounded up to a multiple of 8 here.
        // Returns the sum over element pairs of
        //   (t[i]*w[i] + t[i+1]*w[i+1]) >> 8,
        // i.e. every pairwise partial sum is scaled down by 8 bits before
        // it is accumulated.  A non-positive n yields 0.
        int dot_product(short *t, short *w, int n) {
          int sum=0;
          n=(n+7)&-8;  // round up to a multiple of 8
          for (int i=0; i<n; i+=2) {
            // The shift is applied per pair, not to the final total.
            sum+=(t[i]*w[i]+t[i+1]*w[i+1]) >> 8;
          }
          return sum;
        }
        #else  // The NASM version uses MMX and is about 8 times faster.
        extern "C" int dot_product(short *t, short *w, int n);  // in NASM
        #endif

In D, I can move the ASM inside the function, so there is no need for two declarations:

        // dot_product returns the dot product t*w of n elements.  n is rounded
        // up to a multiple of 8, so both arrays must be readable up to that
        // rounded length.  Every pairwise partial sum
        // (t[i]*w[i] + t[i+1]*w[i+1]) is scaled down by 8 bits before it is
        // accumulated.  A non-positive n yields 0.
        //
        // Inline-asm path:
        //   In:    RDI = t, RSI = w, EDX = n
        //   Out:   EAX = result
        //   Clobb: RCX, RDI, RSI, XMM0, XMM1, flags
        //   NOTE(review): the register assignment assumes the System V AMD64
        //   convention for extern(C); Win64 passes the arguments in RCX, RDX
        //   and R8 instead -- confirm the target before relying on this path.
        //   t and w must be 16-byte aligned (movdqa, and pmaddwd with a
        //   memory operand, fault on unaligned addresses).
        extern (C) int dot_product(short *t, short *w, const int n) {
            version (D_InlineAsm_X86_64) asm {
                naked;
                xor EAX, EAX;            // result is 0 if there is nothing to sum
                mov ECX, EDX;            // n; 32-bit move, upper half of RDX is undefined for an int
                add ECX, 7;              // round n up to a multiple of 8
                and ECX, -8;
                jle done;                // rounded n <= 0: return 0
                sub RDI, 16;             // bias bases so [base+RCX*2] addresses the last 8 elements
                sub RSI, 16;
                pxor XMM0, XMM0;         // four 32-bit partial sums = 0
            next8:                       // each pass handles 8 elements, walking from the end to the start
                movdqa XMM1, [RDI+RCX*2];// 8 elements of t
                pmaddwd XMM1, [RSI+RCX*2];// 4 dwords, each t[i]*w[i] + t[i+1]*w[i+1]
                psrad XMM1, 8;           // scale each pairwise sum down by 8 bits
                paddd XMM0, XMM1;
                sub RCX, 8;
                ja next8;                // RCX is a positive multiple of 8, so this stops exactly at 0
                movdqa XMM1, XMM0;       // add the 4 parts of xmm0 and return the total in eax
                psrldq XMM1, 8;
                paddd XMM0, XMM1;
                movdqa XMM1, XMM0;
                psrldq XMM1, 4;
                paddd XMM0, XMM1;
                movq RAX, XMM0;          // only the low 32 bits (EAX) are the int result
            done:
                ret;
            } else {
                // Portable fallback; same rounding and per-pair scaling as the asm path.
                const int count = (n + 7) & -8;
                int sum = 0;
                for (int i = 0; i < count; i += 2) {
                    sum += (t[i]*w[i] + t[i+1]*w[i+1]) >> 8;
                }
                return sum;
            }
        }

This example also shows why 'naked' should probably not be applied to the function declaration: the function contains non-asm code as well. (It could be "naked asm", though.) For compatibility with GDC (and, in fact, with the original NASM code), I used extern(C) here as the parameter-passing convention.
This may also serve as a practical use case for vector operations.

Reply via email to