== Quote from Craig Black ([email protected])'s article > The following program illustrates the problems with inlining in the dmd > compiler. Perhaps with some more work I can reduce it to a smaller test > case. I was playing around with a simple Array template, and noticed that > similar C++ code performs much better. This is due, at least in part, to > opIndex not being properly inlined by dmd. There are two sort functions, > quickSort1 and quickSort2. quickSort1 indexes an Array data structure. > quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my > core i7. > import std.stdio; > import std.date; > import std.random; > import std.conv; > import std.algorithm; > import std.range; > struct Array(T) > { > public: > this(int length) { resize(length); } > this(T[] a) { append(a); } > this(this) > { > if(!base.array) return; > ArrayBase!T old; > old = base; > base.array = null; > reserve(old.length(), old.length()); > copyData(old); > old.array = null; > } > void clear() { base.clear(); } > void resize(int sz) > { > assert(sz >= 0); > if(sz == 0) return clear(); > if(!base.array) reserve(sz, sz); > *base.lengthPtr() = sz; > } > void reserve(int capacity, int length) > { > assert(length <= capacity); > if(capacity == 0) return clear(); > ArrayBase!T old; > if(base.array) > { > if(base.capacity() >= capacity) return; > old.array = base.array; > base.array = null; > } > base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); > *base.lengthPtr() = length; > *base.capacityPtr() = capacity; > for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); > if(old.array) copyData(old); > } > int length() const { return base.length(); } > int capacity() const { return base.capacity(); } > bool empty() const { return base.array == null; } > ref T at(int i) > { > assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ > " out of bounds of empty array"); > assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ > to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); > return base.data()[i]; > } > ref const T at(int i) const > { > assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ > " out of bounds of empty array"); > assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ > to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); > return base.data()[i]; > } > const ref T opIndex(int i) const { return at(i); } > void opIndexAssign(T t, int i) { at(i) = t; } > void opIndexAssign(ref T t, int i) { at(i) = t; } > void opAssign(ref const Array!T array) > { > copy(array); > } > void opAssign(T[] array) > { > int len = array.length; > resize(len); > for(int i = 0; i < len; i++) at(i) = array[i]; > } > void copy(ref const Array!T array) > { > if(array.empty()) return clear(); > int len = array.length(); > reserve(len, len); > for(int i = 0; i < len; i++) at(i) = array[i]; > } > void opOpAssign(string op, A...)(A a) if(op == "~") > { > appendComposite(a); > } > ref T front() { return at(0); } > const ref T front() const { return at(0); } > ref T back() { return at(length()-1); } > const ref T back() const { return at(length()-1); } > ref T appendOne() > { > int sz = length(); > int sp = capacity(); > if(sp > sz) (*base.lengthPtr())++; > else > { > sz++; > sp = max(2, sp+sp/2, sz); > reserve(sp, sz); > } > return back(); > } > void append(A...)(A a) > { > static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || > is(typeof(a[0]) == T[]))) > { > appendComposite(a[0]); > } else { > appendTuple(a); > } > } > void appendTuple(A...)(A a) > { > foreach(x; a) appendOne() = x; > } > void appendComposite(A)(A a) > { > int prevLen = length(); > resize(prevLen + a.length); > for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; > } > private: > static struct ArrayBase(T) > { > public: > ~this() { clear(); } > void clear() { if(array) delete array; } > int length() const { return array ? *lengthPtr() : 0; } > int capacity() const { return array ? *capacityPtr() : 0; } > int* lengthPtr() const { return cast(int*)(array); } > int* capacityPtr() const { return cast(int*)(array+4); } > T* data() const { return cast(T*)(array+8); } > ubyte* array; > } > ArrayBase!T base; > void copyData(ref ArrayBase!T array) > { > int copyLen = min(length, array.length()); > for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } > } > } > static bool less(T)(T a, T b) { return a < b; } > void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high) > { > for(int i = low; i <= high; i++) > { > int min = i; > for(int j = i + 1; j <= high; j++) > if(L(a[j], a[min])) min = j; > swap(a[i], a[min]); > } > } > int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) > { > T x = a[r]; > int j = p - 1; > for (int i = p; i < r; i++) > { > if (L(x, a[i])) continue; > swap(a[i], a[++j]); > } > a[r] = a[j + 1]; > a[j + 1] = x; > return j + 1; > } > void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) > { > if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); > if (p < r) > { > int q = partition1!(T, L)(a, p, r); > quickSort1!(T, L)(a, p, q - 1); > quickSort1!(T, L)(a, q + 1, r); > } > } > void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, > a.length()-1); } > void insertionSort2(T, alias L = less!T)(T *a, int low, int high) > { > for(int i = low; i <= high; i++) > { > int min = i; > for(int j = i + 1; j <= high; j++) > if(L(a[j], a[min])) min = j; > swap(a[i], a[min]); > } > } > int partition2(T, alias L = less!T)(T *a, int p, int r) > { > T x = a[r]; > int j = p - 1; > for (int i = p; i < r; i++) > { > if (L(x, a[i])) continue; > swap(a[i], a[++j]); > } > a[r] = a[j + 1]; > a[j + 1] = x; > return j + 1; > } > void quickSort2(T, alias L = less!T)(T *a, int p, int r) > { > if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); > if (p < r) > { > int q = partition2!(T, L)(a, p, r); > quickSort2!(T, L)(a, p, q - 1); > quickSort2!(T, L)(a, q + 1, r); > } > } > void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, > length-1); } > double[] vals; > void bench1() > { > Array!double v; > for(int i = 0; i < 100; i++) > { > v = vals; > sort1(v); > } > } > void bench2() > { > Array!double v; > for(int i = 0; i < 100; i++) > { > v = vals; > sort2(&v[0], v.length); > } > } > void main() > { > Mt19937 gen; > vals.length = 1000; > for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); > ulong[] times = [0, 0]; > for(int i = 0; i < 100; i++) > { > auto times2 = benchmark!(bench1, bench2)(1); > times[0] += times2[0]; > times[1] += times2[1]; > } > writeln("Sorting with Array.opIndex: ", times[0]); > writeln("Sorting with pointers: ", times[1]); > writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster"); > }
Testing on GDC with a Netbook, results from 3 consecutive runs are: Without -frelease ------- Sorting with Array.opIndex: 27981 Sorting with pointers: 5602 79.9793 percent faster Sorting with Array.opIndex: 25565 Sorting with pointers: 5179 79.7418 percent faster Sorting with Array.opIndex: 28657 Sorting with pointers: 5772 79.8583 percent faster ------- With -frelease ------- Sorting with Array.opIndex: 10591 Sorting with pointers: 4771 54.9523 percent faster Sorting with Array.opIndex: 10289 Sorting with pointers: 4710 54.223 percent faster Sorting with Array.opIndex: 11305 Sorting with pointers: 5216 53.8611 percent faster ------- With -frelease -fno-bounds-check ------- Sorting with Array.opIndex: 11651 Sorting with pointers: 5236 55.0597 percent faster Sorting with Array.opIndex: 9873 Sorting with pointers: 4559 53.8236 percent faster Sorting with Array.opIndex: 10361 Sorting with pointers: 4745 54.2033 percent faster ------- GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner: ------- Considering inline candidate check. Inlining check into fillUp. Merging blocks 9 and 10 Merging blocks 9 and 11 Considering inline candidate initialize. Inlining initialize into emplace. Merging blocks 2 and 3 Merging blocks 2 and 4 Considering inline candidate bench2. Not inlining: code size would grow by 77 insns. Considering inline candidate bench1. Not inlining: code size would grow by 49 insns. ------- So there's _me_ seriously doubting that inlining has anything to do with the 50% increase. Regards Iain
