Repository: commons-compress Updated Branches: refs/heads/master 0557d0297 -> 1c6d40067
implement infrastructure and "no references to find" case Project: http://git-wip-us.apache.org/repos/asf/commons-compress/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-compress/commit/1c6d4006 Tree: http://git-wip-us.apache.org/repos/asf/commons-compress/tree/1c6d4006 Diff: http://git-wip-us.apache.org/repos/asf/commons-compress/diff/1c6d4006 Branch: refs/heads/master Commit: 1c6d400675e2cc853a6dcbbf2b7f318b6eb4e702 Parents: 0557d02 Author: Stefan Bodewig <[email protected]> Authored: Sat Jan 7 10:47:33 2017 +0100 Committer: Stefan Bodewig <[email protected]> Committed: Sat Jan 7 10:47:33 2017 +0100 ---------------------------------------------------------------------- .../compressors/lz77support/LZ77Compressor.java | 216 ++++++++++++++++++- .../compressors/lz77support/Parameters.java | 2 +- .../lz77support/LZ77CompressorTest.java | 114 ++++++++++ 3 files changed, 327 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-compress/blob/1c6d4006/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java index 468da84..e4afdf1 100644 --- a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java +++ b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java @@ -34,9 +34,10 @@ package org.apache.commons.compress.compressors.lz77support; * <p>This class attempts to extract the core logic - finding * back-references - so it can be re-used. It follows the algorithm * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't - * implement the "lazy match" optimization. The three-byte hash function - * used in this class is the same used by zlib and InfoZIP's ZIP - * implementation of DEFLATE.</p> + * implement the "lazy match" optimization. The three-byte hash + * function used in this class is the same used by zlib and InfoZIP's + * ZIP implementation of DEFLATE. Strongly inspired by InfoZIP's + * implementation.</p> * * <p>LZ77 is used vaguely here (as well as many other places that * talk about it :-), LZSS would likely be closer to the truth but @@ -86,11 +87,19 @@ public class LZ77Compressor { public static abstract class Block { } /** * Represents a literal block of data. + * + * <p>For performance reasons this encapsulates the real data, not + * a copy of it. Don't modify the data and process it inside of + * {@link Callback#accept} immediately as it will get overwritten + * sooner or later.</p> */ public static final class LiteralBlock extends Block { private final byte[] data; - private LiteralBlock(byte[] data) { + private final int offset, length; + /* package private for tests */ LiteralBlock(byte[] data, int offset, int length) { this.data = data; + this.offset = offset; + this.length = length; } /** * The literal data. @@ -101,6 +110,23 @@ public class LZ77Compressor { public byte[] getData() { return data; } + /** + * Offset into data where the literal block starts. + */ + public int getOffset() { + return offset; + } + /** + * Length of literal block. + */ + public int getLength() { + return length; + } + + @Override + public String toString() { + return "LiteralBlock starting at " + offset + " with length " + length; + } } /** * Represents a back-reference to a match. @@ -123,12 +149,19 @@ public class LZ77Compressor { public int getLength() { return length; } + + @Override + public String toString() { + return "BackReference with " + offset + " and length " + length; + } } /** * A simple "we are done" marker. */ public static final class EOD extends Block { } + private static final EOD THE_EOD = new EOD(); + /** * Callback invoked while the compressor processes data. * @@ -140,9 +173,41 @@ public class LZ77Compressor { void accept(Block b); } + static final int NUMBER_OF_BYTES_IN_HASH = 3; + private static final int NO_MATCH = -1; + private final Parameters params; private final Callback callback; + // the sliding window, twice as big as "windowSize" parameter + private final byte[] window; + // the head of hash-chain - indexed by hash-code, points to the + // location inside of window of the latest sequence of bytes with + // the given hash. + private final int[] head; + // for each window-location points to the latest earlier location + // with the same hash. Only stored values for the latest + // "windowSize" elements, the index is "window location modulo + // windowSize". + private final int[] prev; + + // bit mask used when indexing into prev + private final int wMask; + + private boolean initialized = false; + // the position inside of window that shall be encoded right now + private int currentPosition; + // the number of bytes available to compress including the one at + // currentPosition + private int lookahead = 0; + // the hash of the three bytes stating at the current position + private int insertHash = 0; + // the position inside of the window where the current literal + // block starts (in case we are inside of a literal block). + private int blockStart = 0; + // position of the current match + private int matchStart = NO_MATCH; + /** * Initializes a compressor with parameters and a callback. * @param params the parameters @@ -158,6 +223,15 @@ public class LZ77Compressor { } this.params = params; this.callback = callback; + + final int wSize = params.getWindowSize(); + window = new byte[wSize * 2]; + wMask = wSize - 1; + head = new int[HASH_SIZE]; + for (int i = 0; i < HASH_SIZE; i++) { + head[i] = NO_MATCH; + } + prev = new int[wSize]; } /** @@ -179,6 +253,15 @@ public class LZ77Compressor { * @param len the number of bytes to compress */ public void compress(byte[] data, int off, int len) { + final int wSize = params.getWindowSize(); + while (len > wSize) { + doCompress(data, off, wSize); + off += wSize; + len -= wSize; + } + if (len > 0) { + doCompress(data, off, len); + } } /** @@ -190,6 +273,131 @@ public class LZ77Compressor { * the execution of this method.</p> */ public void finish() { + if (blockStart != currentPosition || lookahead > 0) { + currentPosition += lookahead; + flushLiteralBlock(); + } + callback.accept(THE_EOD); + } + + // we use a 15 bit hashcode as calculated in updateHash + private static final int HASH_SIZE = 1 << 15; + private static final int HASH_MASK = HASH_SIZE - 1; + private static final int H_SHIFT = 5; + + /** + * Assumes we are calculating the hash for three consecutive bytes + * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC + * the new hash for BCD is nextHash(H, D). + * + * <p>The hash is shifted by five bits on each update so all + * effects of A have been swapped after the third update.</p> + */ + private int nextHash(int oldHash, byte nextByte) { + final int nextVal = nextByte & 0xFF; + return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK; + } + + // performs the actual algorithm with the pre-condition len <= windowSize + private void doCompress(byte[] data, int off, int len) { + int spaceLeft = window.length - currentPosition - lookahead; + if (len > spaceLeft) { + slide(); + } + System.arraycopy(data, off, window, currentPosition + lookahead, len); + lookahead += len; + if (!initialized && lookahead >= params.getMinMatchSize()) { + initialize(); + } + if (initialized) { + compress(); + } + } + + private void slide() { + final int wSize = params.getWindowSize(); + System.arraycopy(window, wSize, window, 0, wSize); + currentPosition -= wSize; + matchStart -= wSize; + blockStart -= wSize; + for (int i = 0; i< HASH_SIZE; i++) { + int h = head[i]; + head[i] = h >= wSize ? h - wSize : NO_MATCH; + } + for (int i = 0; i < wSize; i++) { + int p = prev[i]; + prev[i] = p >= wSize ? p - wSize : NO_MATCH; + } + } + + private void initialize() { + for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) { + insertHash = nextHash(insertHash, window[i]); + } + initialized = true; + } + + private void compress() { + final int minMatch = params.getMinMatchSize(); + + while (lookahead >= minMatch) { + int matchLength = 0; + int hashHead = insertString(); + if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) { + // sets matchStart as a side effect + matchLength = longestMatch(hashHead); + } + if (matchLength >= minMatch) { + if (blockStart != currentPosition) { + // emit preceeding literal block + flushLiteralBlock(); + blockStart = NO_MATCH; + } + lookahead -= matchLength; + // inserts strings contained in current match + for (int i = 0; i < matchLength - 1; i++) { + currentPosition++; + insertString(); + } + currentPosition++; + flushBackReference(matchLength); + blockStart = currentPosition; + } else { + // no match, append to current or start a new literal + lookahead--; + currentPosition++; + if (currentPosition - blockStart >= params.getMaxLiteralSize()) { + flushLiteralBlock(); + blockStart = currentPosition; + } + } + } } + /** + * Inserts the current three byte sequence into the dictionary and + * returns the previous previous head of the hash-chain. + * + * <p>Updates <code>insertHash</code> and <code>prev</code> as a + * side effect.</p> + */ + private int insertString() { + insertHash = nextHash(insertHash, window[currentPosition -1 + NUMBER_OF_BYTES_IN_HASH]); + int hashHead = head[insertHash]; + prev[currentPosition & wMask] = hashHead; + head[insertHash] = currentPosition; + return hashHead; + } + + private void flushBackReference(int matchLength) { + callback.accept(new BackReference(matchStart, matchLength)); + } + + private void flushLiteralBlock() { + callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart)); + } + + private int longestMatch(int matchHead) { + return 0; + } } http://git-wip-us.apache.org/repos/asf/commons-compress/blob/1c6d4006/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java b/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java index 84548d6..6f2b2cb 100644 --- a/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java +++ b/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java @@ -22,7 +22,7 @@ package org.apache.commons.compress.compressors.lz77support; * Parameters of the {@link LZ77Compressor compressor}. */ public final class Parameters { - public static final int TRUE_MIN_MATCH_SIZE = 3; + public static final int TRUE_MIN_MATCH_SIZE = LZ77Compressor.NUMBER_OF_BYTES_IN_HASH; private final int windowSize, minMatchSize, maxMatchSize, maxOffset, maxLiteralSize; /** http://git-wip-us.apache.org/repos/asf/commons-compress/blob/1c6d4006/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java b/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java new file mode 100644 index 0000000..202c5da --- /dev/null +++ b/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.commons.compress.compressors.lz77support; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.Test; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + +public class LZ77CompressorTest { + + private List<LZ77Compressor.Block> compress(Parameters params, byte[]... chunks) { + final List<LZ77Compressor.Block> blocks = new ArrayList<>(); + LZ77Compressor c = new LZ77Compressor(params, new LZ77Compressor.Callback() { + @Override + public void accept(LZ77Compressor.Block block) { + if (block instanceof LZ77Compressor.LiteralBlock) { + // replace with a real copy of data so tests + // can see the results as they've been when + // the callback has been called + LZ77Compressor.LiteralBlock b = (LZ77Compressor.LiteralBlock) block; + int len = b.getLength(); + block = new LZ77Compressor.LiteralBlock( + Arrays.copyOfRange(b.getData(), b.getOffset(), b.getOffset() + len), + 0, len); + } + blocks.add(block); + } + }); + for (byte[] chunk : chunks) { + c.compress(chunk); + } + c.finish(); + return blocks; + } + + @Test + public void nonCompressableWithLengthSmallerThanLiteralMax() { + byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; + List<LZ77Compressor.Block> blocks = compress(new Parameters(128), data); + assertEquals(2, blocks.size()); + assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass()); + assertArrayEquals(data, ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData()); + assertEquals(LZ77Compressor.EOD.class, blocks.get(1).getClass()); + } + + @Test + public void nonCompressableWithLengthGreaterThanLiteralMaxButLessThanTwiceWindowSize() { + byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; + List<LZ77Compressor.Block> blocks = compress(new Parameters(8), data); + assertEquals(3, blocks.size()); + assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass()); + assertArrayEquals(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 }, + ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData()); + assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(1).getClass()); + assertArrayEquals(new byte[] { 9, 10 }, + ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData()); + assertEquals(LZ77Compressor.EOD.class, blocks.get(2).getClass()); + } + + @Test + public void nonCompressableWithLengthThatForcesWindowSlide() { + byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; + List<LZ77Compressor.Block> blocks = compress(new Parameters(4), data); + assertEquals(4, blocks.size()); + assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass()); + assertArrayEquals(new byte[] { 1, 2, 3, 4 }, + ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData()); + assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(1).getClass()); + assertArrayEquals(new byte[] { 5, 6, 7, 8 }, + ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData()); + assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(2).getClass()); + assertArrayEquals(new byte[] { 9, 10 }, + ((LZ77Compressor.LiteralBlock) blocks.get(2)).getData()); + assertEquals(LZ77Compressor.EOD.class, blocks.get(3).getClass()); + } + + @Test + public void nonCompressableSentAsSingleBytes() { + List<LZ77Compressor.Block> blocks = compress(new Parameters(8), new byte[][] { + { 1 }, { 2 }, { 3 } , { 4 }, { 5 }, + { 6 }, { 7 }, { 8 } , { 9 }, { 10 }, + }); + assertEquals(3, blocks.size()); + assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass()); + assertArrayEquals(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 }, + ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData()); + assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(1).getClass()); + assertArrayEquals(new byte[] { 9, 10 }, + ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData()); + assertEquals(LZ77Compressor.EOD.class, blocks.get(2).getClass()); + } + +}
