Re: Does dmd have SSE intrinsics?

bearophile Tue, 22 Sep 2009 11:20:13 -0700

Jeremie Pelletier:

> The D memory manager already aligns data on 16 bytes boundaries. The 
> only case I can think of right now is when data is in a struct or class:


LDC doesn't align ordinary fixed-size arrays declared inside functions to 16 bytes:
A small test program:

// Minimal test case: three fixed-size float[4] locals and one
// element-wise array addition, used to inspect the code LDC generates.
void main() {
    // a is initialised from an array literal.
    float[4] a = [1.0f, 2.0, 3.0, 4.0];
    // b and c have no explicit initialiser, so they are default-initialised.
    float[4] b, c;        
    // Slice assignment: every element of b is set to 10.0f.
    b[] = 10.0f;
    // Array operation: c[i] = a[i] + b[i] for each of the four elements.
    c[] = a[] + b[];
}


The LLVM IR (LLVM's textual assembly, .ll) that LDC produces; only the head is shown:
ldc -O3 -inline -release -output-ll vect1.d

define x86_stdcallcc i32 @_Dmain(%"char[][]" %unnamed) {
entry:
  %a = alloca [4 x float], align 4                ; <[4 x float]*> [#uses=5]
  %b = alloca [4 x float], align 4                ; <[4 x float]*> [#uses=4]
  %c = alloca [4 x float], align 4                ; <[4 x float]*> [#uses=4]
  %.gc_mem = call noalias i8* @_d_newarrayvT(%object.TypeInfo* 
@_D11TypeInfo_Af6__initZ, i32 4) ; <i8*> [#uses=5]
[...]


The asm it produces for the whole main (the call to the array op is inlined, 
while _d_array_init_float is not inlined, I don't know why):
ldc -O3 -inline -release -output-s vect1.d

# _Dmain -- 32-bit x86, AT&T syntax, as emitted by LDC for the test program.
# Stack frame after the prologue (offsets from %esp):
#    0..11   outgoing call arguments
#   16..31   c  (result array)
#   32..47   b
#   48..63   a
# The array slots sit at multiples of 16 from %esp, but nothing in this
# listing aligns %esp itself.
# The large decimal immediates are IEEE-754 single-precision bit patterns:
#   1065353216 = 0x3F800000 = 1.0f     1073741824 = 0x40000000 = 2.0f
#   1077936128 = 0x40400000 = 3.0f     1082130432 = 0x40800000 = 4.0f
#   1092616192 = 0x41200000 = 10.0f    2143289344 = 0x7FC00000 = quiet NaN
_Dmain:
        # Prologue: save %esi (it holds &b across the calls below) and
        # reserve 64 bytes of stack.
        pushl   %esi
        subl    $64, %esp
        # _d_newarrayvT(TypeInfo for float[], 4): buffer for the array
        # literal; the pointer comes back in %eax.
        movl    $4, 4(%esp)
        movl    $_D11TypeInfo_Af6__initZ, (%esp)
        call    _d_newarrayvT
        # Store 1.0f, 2.0f, 3.0f, 4.0f into the freshly allocated buffer.
        movl    $1065353216, (%eax)
        movl    $1073741824, 4(%eax)
        movl    $1077936128, 8(%eax)
        movl    $1082130432, 12(%eax)
        # Copy elements 2, 1, 0 from the buffer into a (48..59(%esp));
        # element 3 is written directly as the immediate 4.0f.
        movl    8(%eax), %ecx
        movl    %ecx, 56(%esp)
        movl    4(%eax), %ecx
        movl    %ecx, 52(%esp)
        movl    (%eax), %eax
        movl    %eax, 48(%esp)
        movl    $1082130432, 60(%esp)
        # _d_array_init_float(&b, 4, NaN): default-initialise b.
        leal    32(%esp), %esi
        movl    %esi, (%esp)
        movl    $2143289344, 8(%esp)
        movl    $4, 4(%esp)
        call    _d_array_init_float
        # _d_array_init_float(&c, 4, NaN): default-initialise c.
        leal    16(%esp), %eax
        movl    %eax, (%esp)
        movl    $2143289344, 8(%esp)
        movl    $4, 4(%esp)
        call    _d_array_init_float
        # _d_array_init_float(&b, 4, 10.0f): the b[] = 10.0f assignment.
        movl    %esi, (%esp)
        movl    $1092616192, 8(%esp)
        movl    $4, 4(%esp)
        call    _d_array_init_float
        # c[] = a[] + b[], inlined as four scalar single-precision adds
        # (movss/addss), one element at a time rather than one packed op.
        # c[0] = a[0] + b[0]
        movss   48(%esp), %xmm0
        addss   32(%esp), %xmm0
        movss   %xmm0, 16(%esp)
        # c[1] = a[1] + b[1]
        movss   52(%esp), %xmm0
        addss   36(%esp), %xmm0
        movss   %xmm0, 20(%esp)
        # c[2] = a[2] + b[2]
        movss   56(%esp), %xmm0
        addss   40(%esp), %xmm0
        movss   %xmm0, 24(%esp)
        # c[3] = a[3] + b[3]
        movss   60(%esp), %xmm0
        addss   44(%esp), %xmm0
        movss   %xmm0, 28(%esp)
        # Return 0 in %eax, release the frame, restore %esi.
        xorl    %eax, %eax
        addl    $64, %esp
        popl    %esi
        # ret $8 also pops 8 bytes of caller-pushed arguments
        # (presumably the char[][] parameter: length + pointer).
        ret     $8


By the way, with link-time optimization and internalizing, LDC produces this 
LLVM IR (whole main):

; _Dmain after link-time optimization (whole function).
; In this listing there is no alloca for a, and no loads or fadd for
; c[] = a[] + b[]; presumably the unused result was removed as dead code.
; b and c are still allocated with align 4.
define x86_stdcallcc i32 @_Dmain(%"char[][]" %unnamed) {
entry:
  %b = alloca [4 x float], align 4                ; <[4 x float]*> [#uses=1]
  %c = alloca [4 x float], align 4                ; <[4 x float]*> [#uses=1]
  ; Buffer for the array literal [1.0f, 2.0, 3.0, 4.0].
  %.gc_mem = call noalias i8* @_d_newarrayvT(%object.TypeInfo* 
@_D11TypeInfo_Af6__initZ, i32 4) ; <i8*> [#uses=4]
  ; Store 1.0 .. 4.0 at byte offsets 0, 4, 8 and 12 of that buffer.
  %.gc_mem1 = bitcast i8* %.gc_mem to float*      ; <float*> [#uses=1]
  store float 1.000000e+00, float* %.gc_mem1
  %tmp3 = getelementptr i8* %.gc_mem, i32 4       ; <i8*> [#uses=1]
  %0 = bitcast i8* %tmp3 to float*                ; <float*> [#uses=1]
  store float 2.000000e+00, float* %0
  %tmp4 = getelementptr i8* %.gc_mem, i32 8       ; <i8*> [#uses=1]
  %1 = bitcast i8* %tmp4 to float*                ; <float*> [#uses=1]
  store float 3.000000e+00, float* %1
  %tmp5 = getelementptr i8* %.gc_mem, i32 12      ; <i8*> [#uses=1]
  %2 = bitcast i8* %tmp5 to float*                ; <float*> [#uses=1]
  store float 4.000000e+00, float* %2
  ; Default-initialise b with NaN (0x7FF8000000000000 is LLVM's hex
  ; spelling of a quiet NaN float constant).
  %tmp8 = getelementptr [4 x float]* %b, i32 0, i32 0 ; <float*> [#uses=2]
  call void @_d_array_init_float(float* nocapture %tmp8, i32 4, float 
0x7FF8000000000000)
  ; Default-initialise c with NaN.
  %tmp9 = getelementptr [4 x float]* %c, i32 0, i32 0 ; <float*> [#uses=1]
  call void @_d_array_init_float(float* nocapture %tmp9, i32 4, float 
0x7FF8000000000000)
  ; b[] = 10.0f
  call void @_d_array_init_float(float* nocapture %tmp8, i32 4, float 
1.000000e+01)
  ret i32 0
}


Bye,
bearophile

Re: Does dmd have SSE intrinsics?

Reply via email to