I use sets a lot and am now working on a program that will need to hold sets of 65,000+ items, so I thought I do some timings for the different approaches.

Here are some timings (uset uses the AA Unit approach, tset uses an rbtree, and aset uses an AA with bool values):

$ ./sets.d
size 50,000
uset 1 ms, 340 μs, and 8 hnsecs
tset 4 ms, 637 μs, and 1 hnsec
aset 1 ms, 402 μs, and 6 hnsecs
$ ./sets.d
size 100,000
uset 2 ms, 338 μs, and 4 hnsecs
tset 12 ms, 262 μs, and 6 hnsecs
aset 2 ms and 991 μs
$ ./sets.d
size 200,000
uset 5 ms, 971 μs, and 5 hnsecs
tset 30 ms, 675 μs, and 5 hnsecs
aset 6 ms, 74 μs, and 6 hnsecs
$ ./sets.d
size 400,000
uset 11 ms, 823 μs, and 4 hnsecs
tset 74 ms, 146 μs, and 2 hnsecs
aset 12 ms, 560 μs, and 5 hnsecs

What seems pretty clear is that for my purposes (checking presence or absence of membership in a set), AAs are much faster than rbtrees. (This is to be expected since if an AA uses a hash is should be O(1) vs O(lg n) for an rbtree).

Here is the test code I used:

#!/usr/bin/env rdmd

enum SIZE = 400_000;
enum SUB_SIZE = SIZE / 10;
enum MIN_LEN = 10;
enum MAX_LEN = 50;
enum AZ = "abcdefghijklmnopqrstuvwxyz";

void main() {
    import std.stdio: writefln;

    auto data = Data.populate();
    auto sets = Sets(data.words);

    writefln("size %,d", SIZE);
    check(sets.uset, data.present, data.absent, "uset");
    check(sets.tset, data.present, data.absent, "tset");
    check(sets.aset, data.present, data.absent, "aset");
}

struct Sets {
    import std.container.rbtree: RedBlackTree;

    alias Unit = void[0];
    enum unit = Unit.init;

    Unit[string] uset;
    RedBlackTree!string tset;
    bool[string] aset;

    this(string[] words) {
        tset = new RedBlackTree!string;
        foreach (word; words) {
            uset[word] = unit;
            tset.insert(word);
            aset[word] = false;
        }
    }
}

struct Data {
    string[] words;
    string[] present;
    string[] absent;

    static Data populate() {
        Data data;
        for (int i = 0; i < SIZE; i++) {
            auto word = makeWord;
            data.words ~= word;
            if (data.present.length < SUB_SIZE)
                data.present ~= word;
            if (data.absent.length < SUB_SIZE)
                data.absent ~= word ~ "9";
        }
        return data;
    }
}

string makeWord() {
    import std.random: randomCover, uniform;
    import std.range: take;
    import std.string: join, split;

    enum AZS = (AZ ~ AZ ~ AZ ~ AZ ~ AZ ~ AZ).split("");
return randomCover(AZS).take(uniform(MIN_LEN, MAX_LEN)).join("");
}

void check(T)(T set, string[] present, string[] absent, string name) {
    import std.datetime.stopwatch: AutoStart, StopWatch;
    import std.stdio: writeln;

    auto timer = StopWatch(AutoStart.yes);
    foreach (string p; present)
        assert(p in set);
    foreach (string a; absent)
        assert(a !in set);
    writeln(name, " ", timer.peek);
}

Reply via email to