Repository: commons-compress Updated Branches: refs/heads/master 1c6d40067 -> eba864c6f
implement back-reference detection Project: http://git-wip-us.apache.org/repos/asf/commons-compress/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-compress/commit/aa81e37f Tree: http://git-wip-us.apache.org/repos/asf/commons-compress/tree/aa81e37f Diff: http://git-wip-us.apache.org/repos/asf/commons-compress/diff/aa81e37f Branch: refs/heads/master Commit: aa81e37f23e03b8aec4bc287bc84778c2ffc86c3 Parents: 1c6d400 Author: Stefan Bodewig <[email protected]> Authored: Sat Jan 7 18:05:10 2017 +0100 Committer: Stefan Bodewig <[email protected]> Committed: Sat Jan 7 18:05:10 2017 +0100 ---------------------------------------------------------------------- .../compressors/lz77support/LZ77Compressor.java | 68 +++++-- .../lz77support/LZ77CompressorTest.java | 195 +++++++++++++++---- 2 files changed, 209 insertions(+), 54 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-compress/blob/aa81e37f/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java index e4afdf1..2a6fe3d 100644 --- a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java +++ b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java @@ -207,6 +207,8 @@ public class LZ77Compressor { private int blockStart = 0; // position of the current match private int matchStart = NO_MATCH; + // number of insertString calls for the up to three last bytes of the last match + private int missedInserts = 0; /** * Initializes a compressor with parameters and a callback. @@ -341,8 +343,9 @@ public class LZ77Compressor { final int minMatch = params.getMinMatchSize(); while (lookahead >= minMatch) { + catchUpMissedInserts(); int matchLength = 0; - int hashHead = insertString(); + int hashHead = insertString(currentPosition); if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) { // sets matchStart as a side effect matchLength = longestMatch(hashHead); @@ -353,14 +356,10 @@ public class LZ77Compressor { flushLiteralBlock(); blockStart = NO_MATCH; } - lookahead -= matchLength; - // inserts strings contained in current match - for (int i = 0; i < matchLength - 1; i++) { - currentPosition++; - insertString(); - } - currentPosition++; flushBackReference(matchLength); + insertStringsInMatch(matchLength); + lookahead -= matchLength; + currentPosition += matchLength; blockStart = currentPosition; } else { // no match, append to current or start a new literal @@ -381,23 +380,66 @@ public class LZ77Compressor { * <p>Updates <code>insertHash</code> and <code>prev</code> as a * side effect.</p> */ - private int insertString() { - insertHash = nextHash(insertHash, window[currentPosition -1 + NUMBER_OF_BYTES_IN_HASH]); + private int insertString(int pos) { + insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]); int hashHead = head[insertHash]; prev[currentPosition & wMask] = hashHead; - head[insertHash] = currentPosition; + head[insertHash] = pos; return hashHead; } + private void insertStringsInMatch(int matchLength) { + // inserts strings contained in current match + // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts + final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH); + // currentPosition has been inserted already + for (int i = 1; i <= stop; i++) { + insertString(currentPosition + i); + } + missedInserts = matchLength - stop - 1; + } + + private void catchUpMissedInserts() { + while (missedInserts > 0) { + insertString(currentPosition - missedInserts--); + } + } + private void flushBackReference(int matchLength) { - callback.accept(new BackReference(matchStart, matchLength)); + callback.accept(new BackReference(currentPosition - matchStart, matchLength)); } private void flushLiteralBlock() { callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart)); } + /** + * Searches the hash chain for real matches and returns the length + * of the longest match (0 if none were found) that isn't too far + * away (WRT maxOffset). + * + * <p>Sets matchStart to the index of the start position of the + * longest match as a side effect.</p> + */ private int longestMatch(int matchHead) { - return 0; + final int minLength = params.getMinMatchSize(); + int longestMatchLength = minLength - 1; + final int maxPossibleLength = Math.min(params.getMaxMatchSize(), lookahead); + final int minIndex = Math.max(0, currentPosition - params.getMaxOffset()); + while (matchHead >= minIndex) { + int currentLength = 0; + for (int i = 0; i < maxPossibleLength; i++) { + if (window[matchHead + i] != window[currentPosition + i]) { + break; + } + currentLength++; + } + if (currentLength > longestMatchLength) { + longestMatchLength = currentLength; + matchStart = matchHead; + } + matchHead = prev[matchHead & wMask]; + } + return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress() } } http://git-wip-us.apache.org/repos/asf/commons-compress/blob/aa81e37f/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java b/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java index 202c5da..049a085 100644 --- a/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java +++ b/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java @@ -18,6 +18,7 @@ */ package org.apache.commons.compress.compressors.lz77support; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -29,11 +30,46 @@ import static org.junit.Assert.assertEquals; public class LZ77CompressorTest { + private static final byte[] BLA, SAM, ONE_TO_TEN; + + static { + try { + /* + * Example from "An Explanation of the Deflate Algorithm" by "Antaeus Feldspar". + * @see "http://zlib.net/feldspar.html" + */ + BLA = "Blah blah blah blah blah!".getBytes("ASCII"); + + /* + * Example from Wikipedia article about LZSS. + * Note the example uses indices instead of offsets. + * @see "https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Storer%E2%80%93Szymanski" + */ + SAM = ("I am Sam\n" + + "\n" + + "Sam I am\n" + + "\n" + + "That Sam-I-am!\n" + + "That Sam-I-am!\n" + + "I do not like\n" + + "that Sam-I-am!\n" + + "\n" + + "Do you like green eggs and ham?\n" + + "\n" + + "I do not like them, Sam-I-am.\n" + + "I do not like green eggs and ham.").getBytes("ASCII"); + } catch (UnsupportedEncodingException ex) { + throw new RuntimeException("ASCII not supported"); + } + ONE_TO_TEN = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; + } + private List<LZ77Compressor.Block> compress(Parameters params, byte[]... chunks) { final List<LZ77Compressor.Block> blocks = new ArrayList<>(); LZ77Compressor c = new LZ77Compressor(params, new LZ77Compressor.Callback() { @Override public void accept(LZ77Compressor.Block block) { + //System.err.println(block); if (block instanceof LZ77Compressor.LiteralBlock) { // replace with a real copy of data so tests // can see the results as they've been when @@ -56,59 +92,136 @@ public class LZ77CompressorTest { @Test public void nonCompressableWithLengthSmallerThanLiteralMax() { - byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; - List<LZ77Compressor.Block> blocks = compress(new Parameters(128), data); - assertEquals(2, blocks.size()); - assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass()); - assertArrayEquals(data, ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData()); - assertEquals(LZ77Compressor.EOD.class, blocks.get(1).getClass()); + List<LZ77Compressor.Block> blocks = compress(new Parameters(128), ONE_TO_TEN); + assertSize(2, blocks); + assertLiteralBlock(ONE_TO_TEN, blocks.get(0)); } @Test public void nonCompressableWithLengthGreaterThanLiteralMaxButLessThanTwiceWindowSize() { - byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; - List<LZ77Compressor.Block> blocks = compress(new Parameters(8), data); - assertEquals(3, blocks.size()); - assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass()); - assertArrayEquals(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 }, - ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData()); - assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(1).getClass()); - assertArrayEquals(new byte[] { 9, 10 }, - ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData()); - assertEquals(LZ77Compressor.EOD.class, blocks.get(2).getClass()); + List<LZ77Compressor.Block> blocks = compress(new Parameters(8), ONE_TO_TEN); + assertSize(3, blocks); + assertLiteralBlock(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 }, blocks.get(0)); + assertLiteralBlock(new byte[] { 9, 10 }, blocks.get(1)); } @Test public void nonCompressableWithLengthThatForcesWindowSlide() { - byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; - List<LZ77Compressor.Block> blocks = compress(new Parameters(4), data); - assertEquals(4, blocks.size()); - assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass()); - assertArrayEquals(new byte[] { 1, 2, 3, 4 }, - ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData()); - assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(1).getClass()); - assertArrayEquals(new byte[] { 5, 6, 7, 8 }, - ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData()); - assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(2).getClass()); - assertArrayEquals(new byte[] { 9, 10 }, - ((LZ77Compressor.LiteralBlock) blocks.get(2)).getData()); - assertEquals(LZ77Compressor.EOD.class, blocks.get(3).getClass()); + List<LZ77Compressor.Block> blocks = compress(new Parameters(4), ONE_TO_TEN); + assertSize(4, blocks); + assertLiteralBlock(new byte[] { 1, 2, 3, 4, }, blocks.get(0)); + assertLiteralBlock(new byte[] { 5, 6, 7, 8 }, blocks.get(1)); + assertLiteralBlock(new byte[] { 9, 10 }, blocks.get(2)); } @Test public void nonCompressableSentAsSingleBytes() { - List<LZ77Compressor.Block> blocks = compress(new Parameters(8), new byte[][] { - { 1 }, { 2 }, { 3 } , { 4 }, { 5 }, - { 6 }, { 7 }, { 8 } , { 9 }, { 10 }, - }); - assertEquals(3, blocks.size()); - assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(0).getClass()); - assertArrayEquals(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 }, - ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData()); - assertEquals(LZ77Compressor.LiteralBlock.class, blocks.get(1).getClass()); - assertArrayEquals(new byte[] { 9, 10 }, - ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData()); - assertEquals(LZ77Compressor.EOD.class, blocks.get(2).getClass()); + List<LZ77Compressor.Block> blocks = compress(new Parameters(8), stagger(ONE_TO_TEN)); + assertSize(3, blocks); + assertLiteralBlock(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 }, blocks.get(0)); + assertLiteralBlock(new byte[] { 9, 10 }, blocks.get(1)); + } + + @Test + public void blaExampleWithFullArrayAvailableForCompression() + throws UnsupportedEncodingException { + List<LZ77Compressor.Block> blocks = compress(new Parameters(128), BLA); + assertSize(4, blocks); + assertLiteralBlock("Blah b", blocks.get(0)); + assertBackReference(5, 18, blocks.get(1)); + assertLiteralBlock("!", blocks.get(2)); + } + + @Test + public void blaExampleWithShorterMatchLength() throws UnsupportedEncodingException { + List<LZ77Compressor.Block> blocks = compress(new Parameters(128, 3, 5, 0, 0), BLA); + assertSize(7, blocks); + assertLiteralBlock("Blah b", blocks.get(0)); + assertBackReference(5, 5, blocks.get(1)); + assertBackReference(5, 5, blocks.get(2)); + assertBackReference(5, 5, blocks.get(3)); + assertBackReference(5, 3, blocks.get(4)); + assertLiteralBlock("!", blocks.get(5)); } + @Test + public void blaExampleSmallerWindowSize() throws UnsupportedEncodingException { + List<LZ77Compressor.Block> blocks = compress(new Parameters(8), BLA); + assertSize(5, blocks); + assertLiteralBlock("Blah b", blocks.get(0)); + assertEquals(LZ77Compressor.BackReference.class, blocks.get(1).getClass()); + assertBackReference(5, 8, blocks.get(1)); + assertBackReference(5, 8, blocks.get(2)); + assertLiteralBlock("ah!", blocks.get(3)); + } + + @Test + public void blaExampleWithSingleByteWrites() throws UnsupportedEncodingException { + List<LZ77Compressor.Block> blocks = compress(new Parameters(128), stagger(BLA)); + assertEquals(9, blocks.size()); + assertLiteralBlock("Blah b", blocks.get(0)); + assertBackReference(5, 3, blocks.get(1)); + assertBackReference(5, 3, blocks.get(2)); + assertBackReference(5, 3, blocks.get(3)); + assertBackReference(5, 3, blocks.get(4)); + assertBackReference(5, 3, blocks.get(5)); + assertBackReference(5, 3, blocks.get(6)); + assertLiteralBlock("!", blocks.get(7)); + } + + @Test + public void samIAmExampleWithFullArrayAvailableForCompression() throws UnsupportedEncodingException { + List<LZ77Compressor.Block> blocks = compress(new Parameters(1024), SAM); + assertEquals(21, blocks.size()); + assertLiteralBlock("I am Sam\n\n", blocks.get(0)); + assertBackReference(5, 3, blocks.get(1)); + assertLiteralBlock(" ", blocks.get(2)); + assertBackReference(14, 4, blocks.get(3)); + assertLiteralBlock("\n\nThat", blocks.get(4)); + assertBackReference(20, 4, blocks.get(5)); + assertLiteralBlock("-I-am!", blocks.get(6)); + assertBackReference(15, 16, blocks.get(7)); + assertLiteralBlock("I do not like\nt", blocks.get(8)); + assertBackReference(29, 14, blocks.get(9)); + assertLiteralBlock("\nDo you", blocks.get(10)); + assertBackReference(28, 5, blocks.get(11)); + assertLiteralBlock(" green eggs and ham?\n", blocks.get(12)); + assertBackReference(63, 14, blocks.get(13)); + assertLiteralBlock(" them,", blocks.get(14)); + assertBackReference(64, 9, blocks.get(15)); + assertLiteralBlock(".", blocks.get(16)); + assertBackReference(30, 15, blocks.get(17)); + assertBackReference(65, 18, blocks.get(18)); + assertLiteralBlock(".", blocks.get(19)); + } + + private static final void assertSize(int expectedSize, List<LZ77Compressor.Block> blocks) { + assertEquals(expectedSize, blocks.size()); + assertEquals(LZ77Compressor.EOD.class, blocks.get(expectedSize - 1).getClass()); + } + + private static final void assertLiteralBlock(String expectedContent, LZ77Compressor.Block block) + throws UnsupportedEncodingException { + assertLiteralBlock(expectedContent.getBytes("ASCII"), block); + } + + private static final void assertLiteralBlock(byte[] expectedContent, LZ77Compressor.Block block) { + assertEquals(LZ77Compressor.LiteralBlock.class, block.getClass()); + assertArrayEquals(expectedContent, ((LZ77Compressor.LiteralBlock) block).getData()); + } + + private static final void assertBackReference(int expectedOffset, int expectedLength, LZ77Compressor.Block block) { + assertEquals(LZ77Compressor.BackReference.class, block.getClass()); + LZ77Compressor.BackReference b = (LZ77Compressor.BackReference) block; + assertEquals(expectedOffset, b.getOffset()); + assertEquals(expectedLength, b.getLength()); + } + + private static final byte[][] stagger(byte[] data) { + byte[][] r = new byte[data.length][1]; + for (int i = 0; i < data.length; i++) { + r[i][0] = data[i]; + } + return r; + } }
