Try it without -profile.  That tends to slow everything down to a halt.

-Craig

"Andrej Mitrovic" <[email protected]> wrote in message news:[email protected]...
My crappy old Pentium 4 has gone totally mad:

Sorting with Array.opIndex: 45080
Sorting with pointers: 45608
4.092e+16 percent faster (??)

Compiled with dmd -profile -O -release -inline -noboundscheck

On 12/13/10, Iain Buclaw <[email protected]> wrote:
== Quote from Craig Black ([email protected])'s article
The following program illustrates the problems with inlining in the dmd
compiler.  Perhaps with some more work I can reduce it to a smaller test
case. I was playing around with a simple Array template, and noticed that similar C++ code performs much better. This is due, at least in part, to opIndex not being properly inlined by dmd. There are two sort functions,
quickSort1 and quickSort2.  quickSort1 indexes an Array data structure.
quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my
core i7.
import std.stdio;
import std.date;
import std.random;
import std.conv;
import std.algorithm;
import std.range;
struct Array(T)
{
public:
  this(int length) { resize(length); }
  this(T[] a) { append(a); }
  this(this)
  {
    if(!base.array) return;
    ArrayBase!T old;
    old = base;
    base.array = null;
    reserve(old.length(), old.length());
    copyData(old);
    old.array = null;
  }
  void clear() { base.clear(); }
  void resize(int sz)
  {
    assert(sz >= 0);
    if(sz == 0) return clear();
    if(!base.array) reserve(sz, sz);
    *base.lengthPtr() = sz;
  }
  void reserve(int capacity, int length)
  {
    assert(length <= capacity);
    if(capacity == 0) return clear();
    ArrayBase!T old;
    if(base.array)
    {
      if(base.capacity() >= capacity) return;
      old.array = base.array;
      base.array = null;
    }
    base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
    *base.lengthPtr() = length;
    *base.capacityPtr() = capacity;
    for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
    if(old.array) copyData(old);
  }
  int length() const { return base.length(); }
  int capacity() const { return base.capacity(); }
  bool empty() const { return base.array == null; }
  ref T at(int i)
  {
assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i)
~
" out of bounds of empty array");
assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
    return base.data()[i];
  }
  ref const T at(int i) const
  {
assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i)
~
" out of bounds of empty array");
assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
    return base.data()[i];
  }
  const ref T opIndex(int i) const { return at(i); }
  void opIndexAssign(T t, int i) { at(i) = t; }
  void opIndexAssign(ref T t, int i) { at(i) = t; }
  void opAssign(ref const Array!T array)
  {
    copy(array);
  }
  void opAssign(T[] array)
  {
    int len = array.length;
    resize(len);
    for(int i = 0; i < len; i++) at(i) = array[i];
  }
  void copy(ref const Array!T array)
  {
    if(array.empty()) return clear();
    int len = array.length();
    reserve(len, len);
    for(int i = 0; i < len; i++) at(i) = array[i];
  }
  void opOpAssign(string op, A...)(A a) if(op == "~")
  {
    appendComposite(a);
  }
  ref T front() { return at(0); }
  const ref T front() const { return at(0); }
  ref T back() { return at(length()-1); }
  const ref T back() const { return at(length()-1); }
  ref T appendOne()
  {
    int sz = length();
    int sp = capacity();
    if(sp > sz) (*base.lengthPtr())++;
    else
    {
      sz++;
      sp = max(2, sp+sp/2, sz);
      reserve(sp, sz);
    }
    return back();
  }
  void append(A...)(A a)
  {
    static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
is(typeof(a[0]) == T[])))
    {
      appendComposite(a[0]);
    } else {
      appendTuple(a);
    }
  }
  void appendTuple(A...)(A a)
  {
    foreach(x; a) appendOne() = x;
  }
  void appendComposite(A)(A a)
  {
    int prevLen = length();
    resize(prevLen + a.length);
    for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
  }
private:
      static struct ArrayBase(T)
      {
      public:
            ~this() { clear(); }
            void clear() { if(array) delete array; }
            int length() const { return array ? *lengthPtr() : 0; }
            int capacity() const { return array ? *capacityPtr() : 0; }
            int* lengthPtr() const { return cast(int*)(array); }
            int* capacityPtr() const { return cast(int*)(array+4); }
            T* data() const { return cast(T*)(array+8); }
            ubyte* array;
      }
  ArrayBase!T base;
  void copyData(ref ArrayBase!T array)
  {
    int copyLen = min(length, array.length());
    for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
  }
}
static bool less(T)(T a, T b) { return a < b; }
void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
{
  for(int i = low; i <= high; i++)
  {
    int min = i;
    for(int j = i + 1; j <= high; j++)
      if(L(a[j], a[min])) min = j;
    swap(a[i], a[min]);
  }
}
int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
      T x = a[r];
      int j = p - 1;
      for (int i = p; i < r; i++)
      {
            if (L(x, a[i])) continue;
            swap(a[i], a[++j]);
      }
      a[r] = a[j + 1];
      a[j + 1] = x;
      return j + 1;
}
void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
  if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
  if (p < r)
      {
            int q = partition1!(T, L)(a, p, r);
            quickSort1!(T, L)(a, p, q - 1);
            quickSort1!(T, L)(a, q + 1, r);
      }
}
void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
a.length()-1); }
void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
{
  for(int i = low; i <= high; i++)
  {
    int min = i;
    for(int j = i + 1; j <= high; j++)
      if(L(a[j], a[min])) min = j;
    swap(a[i], a[min]);
  }
}
int partition2(T, alias L = less!T)(T *a, int p, int r)
{
      T x = a[r];
      int j = p - 1;
      for (int i = p; i < r; i++)
      {
            if (L(x, a[i])) continue;
            swap(a[i], a[++j]);
      }
      a[r] = a[j + 1];
      a[j + 1] = x;
      return j + 1;
}
void quickSort2(T, alias L = less!T)(T *a, int p, int r)
{
  if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
  if (p < r)
      {
            int q = partition2!(T, L)(a, p, r);
            quickSort2!(T, L)(a, p, q - 1);
            quickSort2!(T, L)(a, q + 1, r);
      }
}
void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a,
0,
length-1); }
double[] vals;
void bench1()
{
  Array!double v;
  for(int i = 0; i < 100; i++)
  {
    v = vals;
    sort1(v);
  }
}
void bench2()
{
  Array!double v;
  for(int i = 0; i < 100; i++)
  {
    v = vals;
    sort2(&v[0], v.length);
  }
}
void main()
{
  Mt19937 gen;
  vals.length = 1000;
  for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
  ulong[] times = [0, 0];
  for(int i = 0; i < 100; i++)
  {
    auto times2 = benchmark!(bench1, bench2)(1);
    times[0] += times2[0];
    times[1] += times2[1];
  }
  writeln("Sorting with Array.opIndex: ", times[0]);
  writeln("Sorting with pointers: ", times[1]);
  writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
}

Testing on GDC with a Netbook, results from 3 consecutive runs are:

Without -frelease
-------
Sorting with Array.opIndex: 27981
Sorting with pointers: 5602
79.9793 percent faster

Sorting with Array.opIndex: 25565
Sorting with pointers: 5179
79.7418 percent faster

Sorting with Array.opIndex: 28657
Sorting with pointers: 5772
79.8583 percent faster
-------


With -frelease
-------
Sorting with Array.opIndex: 10591
Sorting with pointers: 4771
54.9523 percent faster

Sorting with Array.opIndex: 10289
Sorting with pointers: 4710
54.223 percent faster

Sorting with Array.opIndex: 11305
Sorting with pointers: 5216
53.8611 percent faster
-------


With -frelease -fno-bounds-check
-------
Sorting with Array.opIndex: 11651
Sorting with pointers: 5236
55.0597 percent faster

Sorting with Array.opIndex: 9873
Sorting with pointers: 4559
53.8236 percent faster

Sorting with Array.opIndex: 10361
Sorting with pointers: 4745
54.2033 percent faster
-------


GDC doesn't use DMD's FE inliner, but results from the GCC backend's
inliner:
-------
Considering inline candidate check.
 Inlining check into fillUp.
Merging blocks 9 and 10
Merging blocks 9 and 11

Considering inline candidate initialize.
 Inlining initialize into emplace.
Merging blocks 2 and 3
Merging blocks 2 and 4

Considering inline candidate bench2.
Not inlining: code size would grow by 77 insns.
Considering inline candidate bench1.
Not inlining: code size would grow by 49 insns.
-------


So there's _me_ seriously doubting that inlining has anything to do with the
50% increase.

Regards
Iain


Reply via email to