I have tested dmd v2.047beta and it works. I have seen that this bug fix improves
the performance of my code:
http://d.puremagic.com/issues/show_bug.cgi?id=2008

------------------

Walter Bright:

>it would be fair to provide examples when saying things like LLVM does a 
>better job on X.<

This is a D2 program. I compiled it with v2.047beta, and the total() function
doesn't get inlined:

import std.c.stdio: printf;

// Returns the sum of all elements of `data`; an empty slice yields 0.
int total(int[] data) {
    int res; // D default-initializes an int to 0
    foreach (x; data)
        res += x;
    return res;
}

// Prints the sum of a ten-element array whose contents are known at compile
// time (the elements add up to 45).
void main() {
    enum int[] data = [7, 6, 5, 9, 8, 4, 3, 1, 2, 0]; // manifest constant
    printf("%d\n", total(data));
}


asm generated by DMD, -O -release -inline (cleaned up a little):

; total(int[] data) as emitted by DMD: accumulates the sum in ECX and returns
; it in EAX. After the two pushes below, the code reads the array length at
; 0Ch[ESP] and the element pointer at 010h[ESP]; `ret 8` removes those 8 bytes.
_D4test5totalFAiZi      comdat
                push    EAX                     ; reserve a 4-byte scratch slot (4[ESP] after the next push)
                xor     ECX,ECX                 ; ECX = running sum = 0
                xor     EDX,EDX                 ; EDX = initial index = 0
                push    EBX                     ; save EBX; used below as the base pointer
                cmp     0Ch[ESP],ECX            ; length == 0 ?
                je      L28                     ; empty array: skip the loop, sum stays 0
                mov     4[ESP],EDX              ; spill the index (0) to the scratch slot...
                mov     EDX,010h[ESP]           ; ...load the element pointer through EDX...
                mov     EBX,EDX                 ; ...and move it into EBX
                mov     EAX,0Ch[ESP]            ; EAX = length; never read afterwards (the loop re-reads memory)
                mov     EDX,4[ESP]              ; reload index = 0 from the scratch slot
L1E:            add     ECX,[EDX*4][EBX]        ; sum += ptr[index]
                inc     EDX                     ; index++
                cmp     EDX,0Ch[ESP]            ; compare index with length, read from the stack each iteration
                jb      L1E                     ; loop while index < length (unsigned)
L28:            pop     EBX                     ; restore EBX
                mov     EAX,ECX                 ; return value = sum
                pop     ECX                     ; discard the scratch slot
                ret     8                       ; return and pop the 8 bytes of array argument

; D main() as emitted by DMD: builds the ten-element array at run time through
; __d_arrayliteralT, calls total() (not inlined) and prints the result.
__Dmain comdat
L0:             push    EAX                     ; these two pushes reserve 8 bytes of stack,
                push    EAX                     ; released by the `add ESP,8` before `ret`
                mov     EAX,offset FLAT:_D11TypeInfo_Ai6__initZ ; TypeInfo for int[] (per the mangled name)
                push    EBX                     ; save EBX
                push    0                       ; elements are pushed last-to-first: data[9]
                push    2                       ; data[8]
                push    1                       ; data[7]
                push    3                       ; data[6]
                push    4                       ; data[5]
                push    8                       ; data[4]
                push    9                       ; data[3]
                push    5                       ; data[2]
                push    6                       ; data[1]
                push    7                       ; data[0]
                push    0Ah                     ; element count (10)
                push    EAX                     ; TypeInfo argument
                call    near ptr __d_arrayliteralT
                add     ESP,030h                ; caller pops the 12 pushed dwords (48 bytes)
                mov     ECX,EAX                 ; returned value, passed on below as the array pointer
                push    ECX                     ; total() argument: element pointer
                mov     EBX,0Ah
                push    EBX                     ; total() argument: length (10)
                call    near ptr _D4test5totalFAiZi ; a real call: total() was not inlined
                mov     EDX,offset FLAT:_DATA   ; presumably the address of the "%d\n" format string
                push    EAX                     ; printf argument 2: sum returned by total()
                push    EDX                     ; printf argument 1: format
                call    near ptr _printf
                add     ESP,8                   ; caller pops printf's two arguments
                xor     EAX,EAX                 ; return 0
                pop     EBX                     ; restore EBX
                add     ESP,8                   ; release the 8 bytes reserved on entry
                ret

------------------

This is the same program translated to D1 for Tango:


import tango.stdc.stdio: printf;

// Returns the sum of all elements of `data`; an empty slice yields 0.
int total(int[] data) {
    int res; // D default-initializes an int to 0
    foreach (x; data)
        res += x;
    return res;
}

// Prints the sum of a constant ten-element array (the elements add up to 45).
void main() {
    const int[] data = [7, 6, 5, 9, 8, 4, 3, 1, 2, 0];
    printf("%d\n", total(data));
}


The asm generated by LDC, -O3 -release -inline:

# total(int[] data) as emitted by LDC (AT&T syntax): returns the sum in %eax.
# After the push below, the length is read from 8(%esp) and the element
# pointer from 12(%esp); `ret $8` removes those 8 bytes of arguments.
_D4temp5totalFAiZi:
        pushl   %esi                    # save %esi; used as the loop index
        movl    8(%esp), %ecx           # %ecx = length
        testl   %ecx, %ecx              # length == 0 ?
        je      .LBB1_4                 # empty array: return 0
        movl    12(%esp), %edx          # %edx = element pointer
        xorl    %eax, %eax              # sum = 0
        movl    %eax, %esi              # index = 0
        .align  16
.LBB1_2:
        addl    (%edx,%esi,4), %eax     # sum += ptr[index]
        incl    %esi                    # index++
        cmpl    %ecx, %esi              # length is compared from a register, not from memory
        jne     .LBB1_2                 # loop until index == length
.LBB1_3:
        popl    %esi                    # restore %esi
        ret     $8                      # return and pop the 8 bytes of array argument
.LBB1_4:
        xorl    %eax, %eax              # empty-array path: result = 0
        jmp     .LBB1_3

        # Static image of the ten-element array literal: nine .long values,
        # with the final element (0) emitted as `.zero 4`, 40 bytes in total.
        .type   .constarray,@object
        .data
        .align  16
.constarray:
        .long   7
        .long   6
        .long   5
        .long   9
        .long   8
        .long   4
        .long   3
        .long   1
        .long   2
        .zero   4
        .size   .constarray, 40

# D main() as emitted by LDC: total() is inlined and fully unrolled into ten
# loads/adds from .constarray, but the sum is still computed at run time.
_Dmain:
        subl    $12, %esp               # reserve 12 bytes; (%esp) and 4(%esp) hold printf's arguments
        movl    .constarray+4, %eax     # start with data[1]...
        addl    .constarray, %eax       # ...then add data[0]
        addl    .constarray+8, %eax     # + data[2]
        addl    .constarray+12, %eax    # + data[3]
        addl    .constarray+16, %eax    # + data[4]
        addl    .constarray+20, %eax    # + data[5]
        addl    .constarray+24, %eax    # + data[6]
        addl    .constarray+28, %eax    # + data[7]
        addl    .constarray+32, %eax    # + data[8]
        addl    .constarray+36, %eax    # + data[9]
        movl    %eax, 4(%esp)           # printf argument 2: the sum
        movl    $.str, (%esp)           # printf argument 1: presumably the "%d\n" format string
        call    printf
        xorl    %eax, %eax              # return 0
        addl    $12, %esp               # release the reserved stack space
        ret     $8


You can see that ldc inlined total(), and in this case it unrolled the loop too,
because the array is known at compile time. But it doesn't perform the last
optimization: folding the ten additions into a single constant at compile time.

---------------------------

If I use link-time optimization with LDC, it optimizes the code better. This
is the disassembly of main:

08049620 <_Dmain>:
 8049620:       83 ec 0c                sub    $0xc,%esp
 8049623:       c7 44 24 04 2d 00 00    movl   $0x2d,0x4(%esp)
 804962a:       00 
 804962b:       c7 04 24 48 35 06 08    movl   $0x8063548,(%esp)
 8049632:       e8 85 fd ff ff          call   80493bc <printf@plt>
 8049637:       31 c0                   xor    %eax,%eax
 8049639:       83 c4 0c                add    $0xc,%esp
 804963c:       c2 08 00                ret    $0x8
 804963f:       90                      nop    

That movl $0x2d,0x4(%esp) is the result, 45 in base 10, fully computed.

Bye,
bearophile

Reply via email to