
This is a proof-of-concept micro-benchmark that compresses the
37_640 byte symbol from the bug report linked above with both
id_compress from the dmd backend and LZ4 (without a dictionary).
The LZ4 result is additionally re-encoded into 7-bit units,
similar to base-64.
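Packing n compressed bytes into 7-bit units grows them to
(n * 8 + 6) / 7 bytes, i.e. roughly 14 % overhead.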

Original size: 37640 bytes

id_compress : 5243 ms for 10_000 iterations, 800 bytes
lz4         :   71 ms for 10_000 iterations, 389 bytes

That corresponds to about twice the compression ratio at
roughly 70 times the compression speed.
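(Output size: 800 / 389 ≈ 2.1; time: 5243 ms / 71 ms ≈ 74.)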

-- 
Marco
import std.bitmanip;
import core.stdc.string;


struct S(T)
{
    void foo(){ throw new Exception("1");}
}

auto s(T)(T t)
{
    struct Result
    {
        void foo(){ throw new Exception("2");}
    }
    return Result();
}


void main()
{
    import core.stdc.stdlib;
    import std.stdio;
    import std.datetime;

    string text = 1.s.s.s.s.s.s.s.s.s.s.foo.mangleof;
    writefln("Original size: %s", text.length);
    size_t comprSize, comprSize2;
    char* buffer;
    StopWatch sw;
    sw.start();
    foreach (i; 0 .. 10_000)
    {
        buffer = cast(char*)malloc((LZ4_compressBound(text.length) * 8 + 6) / 7);
        comprSize = LZ4_compress_fast(text.ptr, buffer, text.length);
        comprSize2 = (comprSize * 8 + 6) / 7;
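        // Re-encode in place: each 7-bit group of the LZ4 output
        // becomes one byte offset by '!'. Walking backwards keeps
        // not-yet-read input bytes from being overwritten, since the
        // read positions (k * 7 / 8 and the byte after it) never lie
        // beyond the write position k.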
        foreach_reverse (k; 0 .. comprSize2)
        {
            size_t idx = k * 7 / 8;
            ubyte  bit = k * 7 % 8;
            ubyte _7bit = buffer[idx] >> bit;
            if (bit > 1)
                _7bit |= buffer[idx+1] << (8 - bit);
            _7bit &= 0x7F;
            _7bit += '!';
            buffer[k] = _7bit;
        }
    }
    sw.stop();
    writefln("%s ms for 10_000 iterations", sw.peek.msecs);
    writefln("Compressed size: %s", comprSize2);
    writefln("Compressed bytes: %s", cast(ubyte[])buffer[0 .. comprSize2]);
    stdout.flush();

    sw.reset();
    sw.start();
    foreach (i; 0 .. 10_000)
        id_compress(text.ptr, text.length, &comprSize);
    sw.stop();
    writefln("%s ms for 10_000 iterations", sw.peek.msecs);
    writefln("Compresed size (id_compress): %s", comprSize);
}


// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
enum LZ4_MEMORY_USAGE = 14;
enum LZ4_HASHLOG      = LZ4_MEMORY_USAGE-2;
enum HASH_SIZE_U32    = (1 << LZ4_MEMORY_USAGE) / uint.sizeof;
enum COPYLENGTH       = 8;
enum MINMATCH         = 4;
enum MFLIMIT          = COPYLENGTH + MINMATCH;
enum LZ4_minLength    = MFLIMIT + 1;
enum LASTLITERALS     = 5;
// Increase this value ==> compression run slower on incompressible data
enum LZ4_skipTrigger  = 6U;

enum MAXD_LOG         = 16;
enum MAX_DISTANCE     = (1 << MAXD_LOG) - 1;

enum LZ4_MAX_INPUT_SIZE = 0x7E000000; /* 2 113 929 216 bytes */
enum LZ4_64Klimit       = 64*1024 + (MFLIMIT-1);


enum tableType_t { byPtr, byU32, byU16 }


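// One LZ4 sequence token: the low 4 bits hold the match length, the
// high 4 bits the literal length; both saturate at 15 and spill into
// extra length bytes.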
struct Token
{
    mixin(bitfields!(
        size_t, "matchLength", 4,
        size_t, "literalLength", 4,
    ));
}


struct LZ4_stream
{
    uint[HASH_SIZE_U32] hashTable;
    uint currentOffset;
    uint initCheck;
    const char* dictionary;
    uint dictSize;
}


size_t LZ4_compressBound(size_t isize)
{
    return isize > LZ4_MAX_INPUT_SIZE ? 0 : isize + (isize/255) + 16;
}


ushort LZ4_read16(in void* memPtr)
{
    ushort val16;
    memcpy(&val16, memPtr, 2);
    return val16;
}


uint LZ4_read32(in void* memPtr)
{
    uint val32;
    memcpy(&val32, memPtr, 4);
    return val32;
}


size_t LZ4_read_ARCH(in void* p)
{
    size_t result = void;
    memcpy(&result, p, (void*).sizeof);
    return result;
}


uint LZ4_hashSequence(size_t sequence, const tableType_t tableType)
{
    immutable hashLog = (tableType == tableType_t.byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG;

    version (D_LP64)
    {
        enum prime5bytes = 889523592379UL;
        immutable hashMask = (1<<hashLog) - 1;
        return (sequence * prime5bytes >> 40 - hashLog) & hashMask;
    }
    else
    {
        // 32-bit: multiply as uint so the product wraps like the C original
        return sequence * 2654435761U >> (MINMATCH * 8 - hashLog);
    }
}


uint LZ4_hashPosition(in void* p, tableType_t tableType)
{
    return LZ4_hashSequence(LZ4_read_ARCH(p), tableType);
}


void LZ4_putPositionOnHash(in char* p, uint h, ref LZ4_stream tableBase,
    const tableType_t tableType, in char* srcBase)
{
    with (tableType_t) final switch (tableType)
    {
        case byPtr:
            const(char)** hashTable = cast(const(char)**)tableBase.hashTable.ptr;
            hashTable[h] = p;
            return;
        case byU32:
            uint* hashTable = cast(uint*)tableBase.hashTable.ptr;
            hashTable[h] = cast(uint)  (p-srcBase);
            return;
        case byU16:
            ushort* hashTable = cast(ushort*)tableBase.hashTable.ptr;
            hashTable[h] = cast(ushort)(p-srcBase);
            return;
    }
}


void LZ4_putPosition(in char* p, ref LZ4_stream tableBase, const tableType_t tableType,
    in char* srcBase)
{
    uint h = LZ4_hashPosition(p, tableType);
    LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
}


const(char*) LZ4_getPositionOnHash(uint h, ref const LZ4_stream tableBase,
    const tableType_t tableType, in char* srcBase)
{
    with (tableType_t) final switch (tableType)
    {
        case tableType_t.byPtr:
            const char** hashTable = cast(const char**)tableBase.hashTable.ptr;
            return hashTable[h];
        case tableType_t.byU32:
            uint* hashTable = cast(uint*)tableBase.hashTable.ptr;
            return hashTable[h] + srcBase;
        case tableType_t.byU16:
            ushort* hashTable = cast(ushort*)tableBase.hashTable.ptr;
            return hashTable[h] + srcBase;
    }
}


const(char*) LZ4_getPosition(in char* p, ref const LZ4_stream tableBase,
    const tableType_t tableType, in char* srcBase)
{
    uint h = LZ4_hashPosition(p, tableType);
    return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase);
}


/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */
void LZ4_wildCopy(void* dstPtr, scope const(void)* srcPtr, in void* dstEnd)
{
    do
    {
        memcpy(dstPtr, srcPtr, 8);
        dstPtr += 8; srcPtr += 8;
    }
    while (dstPtr < dstEnd);
}


void LZ4_writeLE16(void* memPtr, ushort value)
{
    version (LittleEndian)
    {
        memcpy(memPtr, &value, 2);
    }
    else
    {
        ubyte* p = cast(ubyte*)memPtr;
        p[0] = cast(ubyte)value;
        p[1] = value >> 8;
    }
}


size_t LZ4_NbCommonBytes(size_t val)
{
    import core.bitop;
    version (LittleEndian)
        return bsf(val) >> 3;   // number of trailing zero bytes
    else   /* Big Endian CPU */
        return (size_t.sizeof * 8 - 1 - bsr(val)) >> 3;   // number of leading zero bytes
}


size_t LZ4_count(scope const(char)* pIn, scope const(char)* pMatch, in char* pInLimit)
{
    const char* pStart = pIn;
    
    while (pIn < pInLimit-(size_t.sizeof-1))
    {
        size_t diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
        if (!diff) { pIn+=size_t.sizeof; pMatch+=size_t.sizeof; continue; }
        pIn += LZ4_NbCommonBytes(diff);
        return pIn - pStart;
    }

    version (D_LP64) if (pIn < (pInLimit-3) && LZ4_read32(pMatch) == LZ4_read32(pIn))
    {
        pIn+=4; pMatch+=4;
    }
    if (pIn < (pInLimit-1) && LZ4_read16(pMatch) == LZ4_read16(pIn))
    {
        pIn+=2; pMatch+=2;
    }
    if (pIn < pInLimit && *pMatch == *pIn)
        pIn++;
    return pIn - pStart;
}


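/* Greedy LZ4 match search and sequence emission, following the
 * reference lz4.c (simplified: no dictionary support, no output
 * size limit). */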
size_t LZ4_compress_generic(ref LZ4_stream ctx, in char* source, char* dest,
    const size_t inputSize, const tableType_t tableType)
in
{
    assert (inputSize <= LZ4_MAX_INPUT_SIZE, "Unsupported input size, too large");
    assert (tableType != tableType_t.byU16 || inputSize < LZ4_64Klimit, "Size too large (not within 64K limit)");
}
body
{
    const(char)*
        ip          = source,
        anchor      = source;
    const char*
        lowRefLimit = ip - ctx.dictSize,
        dictionary  = ctx.dictionary,
        dictEnd     = dictionary + ctx.dictSize,
        iend        = ip + inputSize,
        mflimit     = iend - MFLIMIT,
        matchlimit  = iend - LASTLITERALS;

    immutable dictDelta = dictEnd - source;
    
    char*
        op = dest;

    uint forwardH;
    size_t refDelta = 0;
    
    const char* base = source, lowLimit = source;

    /* Input too small, no compression (all literals) */
    if (inputSize < LZ4_minLength)
        goto _last_literals;
    
    /* First Byte */
    LZ4_putPosition(ip, ctx, tableType, base);
    ip++; forwardH = LZ4_hashPosition(ip, tableType);
    
    /* Main Loop */
    for ( ; ; )
    {
        const(char)* match;
        Token* token;
        {
            const(char)* forwardIp = ip;
            uint step = 1;
            uint searchMatchNb = 1 << LZ4_skipTrigger;
            
            /* Find a match */
            do
            {
                uint h = forwardH;
                ip = forwardIp;
                forwardIp += step;
                step = (searchMatchNb++ >> LZ4_skipTrigger);
                
                if (forwardIp > mflimit)
                    goto _last_literals;
                
                match = LZ4_getPositionOnHash(h, ctx, tableType, base);
                forwardH = LZ4_hashPosition(forwardIp, tableType);
                LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
                
            }
            // Retry while the candidate is too far back (non-byU16 tables)
            // or its first four bytes differ from the current position.
            while ((tableType != tableType_t.byU16 && match + MAX_DISTANCE < ip)
                || LZ4_read32(match+refDelta) != LZ4_read32(ip));
        }
        
        /* Catch up */
        while (ip > anchor && match+refDelta > lowLimit && ip[-1] == match[refDelta-1])
        {
            ip--; match--;
        }
        
        {
            /* Encode Literal length */
            size_t litLength = ip - anchor;
            token = cast(Token*)op++;
            if (litLength >= Token.literalLength_max)
            {
                token.literalLength = Token.literalLength_max;
                size_t len = litLength - Token.literalLength_max;
                for (; len >= 255 ; len -= 255)
                    *op++ = 255;
                *op++ = cast(ubyte)len;
            }
            else token.literalLength = litLength;
            
            /* Copy Literals */
            LZ4_wildCopy(op, anchor, op+litLength);
            op += litLength;
        }
        
    _next_match:
        /* Encode Offset */
        LZ4_writeLE16(op, cast(ushort)(ip-match)); op += 2;
        
        /* Encode MatchLength */
        {
            size_t matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
            ip += MINMATCH + matchLength;

            if (matchLength >= Token.matchLength_max)
            {
                token.matchLength = Token.matchLength_max;
                matchLength -= Token.matchLength_max;
                for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; }
                if (matchLength >= 255) { matchLength-=255; *op++ = 255; }
                *op++ = cast(ubyte)matchLength;
            }
            else token.matchLength = cast(ubyte)matchLength;
        }
        
        anchor = ip;
        
        /* Test end of chunk */
        if (ip > mflimit) break;
        
        /* Fill table */
        LZ4_putPosition(ip-2, ctx, tableType, base);
        
        /* Test next position */
        match = LZ4_getPosition(ip, ctx, tableType, base);
        LZ4_putPosition(ip, ctx, tableType, base);
        if (match + MAX_DISTANCE >= ip && LZ4_read32(match+refDelta) == LZ4_read32(ip))
        {
            token = cast(Token*)op++;
            *token = Token.init;
            goto _next_match;
        }
        
        /* Prepare next loop */
        forwardH = LZ4_hashPosition(++ip, tableType);
    }
    
_last_literals:
    /* Encode Last Literals */
    {
        Token* token = cast(Token*)op++;
        const lastRun = iend - anchor;
        if (lastRun >= Token.literalLength_max)
        {
            token.literalLength = Token.literalLength_max;
            size_t accumulator = lastRun - Token.literalLength_max;
            for (; accumulator >= 255 ; accumulator -= 255)
                *op++ = 255;
            *op++ = cast(ubyte)accumulator;
        }
        else token.literalLength = lastRun;
        memcpy(op, anchor, lastRun);
        op += lastRun;
    }
    
    /* End */
    return op - dest;
}


version (GNU)
{
    import gcc.attribute;
    enum noinline = attribute("noinline");
}
else
    enum noinline;


@noinline
size_t LZ4_compress_fast(in char* source, char* dest, size_t inputSize)
{
    version (GNU) {} else pragma(inline, false);

    LZ4_stream state;

    if (inputSize < LZ4_64Klimit)
        return LZ4_compress_generic(state, source, dest, inputSize, tableType_t.byU16);
    else version (D_LP64)
        return LZ4_compress_generic(state, source, dest, inputSize, tableType_t.byU32);
    else
        return LZ4_compress_generic(state, source, dest, inputSize, tableType_t.byPtr);
}


// DMD CODE HERE

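/* Brute-force search for the longest occurrence of pattern[0 .. plen]
 * inside dict[0 .. dlen]; helper for id_compress, taken from the dmd
 * backend. */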
static int longest_match(in char* dict, size_t dlen, in char* pattern, size_t plen,
    int *pmatchoff, int *pmatchlen)
{
    int matchlen = 0;
    int matchoff;
    
    int i;
    int j;
    
    for (i = 0; i < dlen; i++)
    {
        if (dict[i] == pattern[0])
        {
            for (j = 1; 1; j++)
            {
                if (i + j == dlen || j == plen)
                    break;
                if (dict[i + j] != pattern[j])
                    break;
            }
            if (j >= matchlen)
            {
                matchlen = j;
                matchoff = i;
            }
        }
    }
    
    if (matchlen > 1)
    {
        *pmatchlen = matchlen;
        *pmatchoff = matchoff;
        return 1;                       // found a match
    }
    return 0;                           // no match
}

/******************************************
 * Compress an identifier for name mangling purposes.
 * Format is if ASCII, then it's just the char.
 * If high bit set, then it's a length/offset pair
 *
 * Params:
 *      id = string to compress
 *      idlen = length of id
 *      plen = where to store length of compressed result
 * Returns:
 *      malloc'd compressed 0-terminated identifier
 */

char* id_compress(in char* id, size_t idlen, scope size_t* plen)
{
    import core.stdc.stdlib;

    int count = 0;
    char* p = cast(char*)malloc(idlen + 1);
    for (size_t i = 0; i < idlen; i++)
    {
        int matchoff;
        int matchlen;
        
        size_t j = 0;
        if (i > 1023)
            j = i - 1023;
        
        if (longest_match(id + j, i - j, id + i, idlen - i, &matchoff, &matchlen))
        {
            matchoff += j;
            size_t off = i - matchoff;
            //printf("matchoff = %3d, matchlen = %2d, off = %d\n", matchoff, matchlen, off);
            assert(off >= matchlen);
            
            if (off <= 8 && matchlen <= 8)
            {
                p[count] = cast(char)(0xC0 | ((off - 1) << 3) | (matchlen - 1));
                count++;
                i += matchlen - 1;
                continue;
            }
            else if (matchlen > 2 && off < 1024)
            {
                if (matchlen >= 1024)
                    matchlen = 1023;    // longest representable match
                p[count + 0] = 0x80 | ((matchlen >> 4) & 0x38) | ((off >> 7) & 7);
                p[count + 1] = cast(char)(0x80 | matchlen);
                p[count + 2] = cast(char)(0x80 | off);
                count += 3;
                i += matchlen - 1;
                continue;
            }
        }
        p[count] = id[i];
        count++;
    }
    p[count] = 0;
    //printf("old size = %d, new size = %d\n", idlen, count);
    *plen = count;
    return p;
}
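

/******************************************
 * Not part of the benchmark: a minimal decoder sketch for the id_compress
 * format, added only to illustrate the encoding described above. The name
 * id_decompress and the GC-allocated result are assumptions, not dmd API.
 * Bytes without the high bit are plain characters; a byte with the two top
 * bits set is a 1-byte length/offset pair (3 bits each, stored minus 1);
 * any other high-bit byte starts a 3-byte pair with 10-bit length/offset.
 */
char[] id_decompress(in char[] compressed)
{
    char[] result;
    for (size_t i = 0; i < compressed.length; )
    {
        const b = compressed[i];
        if (!(b & 0x80))                    // plain ASCII character
        {
            result ~= b;
            i += 1;
        }
        else if ((b & 0xC0) == 0xC0)        // short pair: off <= 8, len <= 8
        {
            const off = ((b >> 3) & 7) + 1;
            const len = (b & 7) + 1;
            foreach (_; 0 .. len)           // byte-wise copy from `off` back
                result ~= result[$ - off];
            i += 1;
        }
        else                                // long pair: 10-bit len and off
        {
            const len = ((b & 0x38) << 4) | (compressed[i + 1] & 0x7F);
            const off = ((b & 7) << 7) | (compressed[i + 2] & 0x7F);
            foreach (_; 0 .. len)
                result ~= result[$ - off];
            i += 3;
        }
    }
    return result;
}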
