Repository: commons-compress
Updated Branches:
  refs/heads/master 0557d0297 -> 1c6d40067


implement infrastructure and "no references to find" case


Project: http://git-wip-us.apache.org/repos/asf/commons-compress/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-compress/commit/1c6d4006
Tree: http://git-wip-us.apache.org/repos/asf/commons-compress/tree/1c6d4006
Diff: http://git-wip-us.apache.org/repos/asf/commons-compress/diff/1c6d4006

Branch: refs/heads/master
Commit: 1c6d400675e2cc853a6dcbbf2b7f318b6eb4e702
Parents: 0557d02
Author: Stefan Bodewig <[email protected]>
Authored: Sat Jan 7 10:47:33 2017 +0100
Committer: Stefan Bodewig <[email protected]>
Committed: Sat Jan 7 10:47:33 2017 +0100

----------------------------------------------------------------------
 .../compressors/lz77support/LZ77Compressor.java | 216 ++++++++++++++++++-
 .../compressors/lz77support/Parameters.java     |   2 +-
 .../lz77support/LZ77CompressorTest.java         | 114 ++++++++++
 3 files changed, 327 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-compress/blob/1c6d4006/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java
 
b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java
index 468da84..e4afdf1 100644
--- 
a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java
+++ 
b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java
@@ -34,9 +34,10 @@ package org.apache.commons.compress.compressors.lz77support;
  * <p>This class attempts to extract the core logic - finding
  * back-references - so it can be re-used. It follows the algorithm
  * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't
- * implement the "lazy match" optimization. The three-byte hash function
- * used in this class is the same used by zlib and InfoZIP's ZIP
- * implementation of DEFLATE.</p>
+ * implement the "lazy match" optimization. The three-byte hash
+ * function used in this class is the same used by zlib and InfoZIP's
+ * ZIP implementation of DEFLATE. Strongly inspired by InfoZIP's
+ * implementation.</p>
  *
  * <p>LZ77 is used vaguely here (as well as many other places that
  * talk about it :-), LZSS would likely be closer to the truth but
@@ -86,11 +87,19 @@ public class LZ77Compressor {
     public static abstract class Block { }
     /**
      * Represents a literal block of data.
+     *
+     * <p>For performance reasons this encapsulates the real data, not
+     * a copy of it. Don't modify the data and process it inside of
+     * {@link Callback#accept} immediately as it will get overwritten
+     * sooner or later.</p>
      */
     public static final class LiteralBlock extends Block {
         private final byte[] data;
-        private LiteralBlock(byte[] data) {
+        private final int offset, length;
+        /* package private for tests */ LiteralBlock(byte[] data, int offset, 
int length) {
             this.data = data;
+            this.offset = offset;
+            this.length = length;
         }
         /**
          * The literal data.
@@ -101,6 +110,23 @@ public class LZ77Compressor {
         public byte[] getData() {
             return data;
         }
+        /**
+         * Offset into data where the literal block starts.
+         */
+        public int getOffset() {
+            return offset;
+        }
+        /**
+         * Length of literal block.
+         */
+        public int getLength() {
+            return length;
+        }
+
+        @Override
+        public String toString() {
+            return "LiteralBlock starting at " + offset + " with length " + 
length;
+        }
     }
     /**
      * Represents a back-reference to a match.
@@ -123,12 +149,19 @@ public class LZ77Compressor {
         public int getLength() {
             return length;
         }
+
+        @Override
+        public String toString() {
+            return "BackReference with " + offset + " and length " + length;
+        }
     }
     /**
      * A simple "we are done" marker.
      */
     public static final class EOD extends Block { }
 
+    private static final EOD THE_EOD = new EOD();
+
     /**
      * Callback invoked while the compressor processes data.
      *
@@ -140,9 +173,41 @@ public class LZ77Compressor {
         void accept(Block b);
     }
 
+    static final int NUMBER_OF_BYTES_IN_HASH = 3;
+    private static final int NO_MATCH = -1;
+
     private final Parameters params;
     private final Callback callback;
 
+    // the sliding window, twice as big as "windowSize" parameter
+    private final byte[] window;
+    // the head of hash-chain - indexed by hash-code, points to the
+    // location inside of window of the latest sequence of bytes with
+    // the given hash.
+    private final int[] head;
+    // for each window-location points to the latest earlier location
+    // with the same hash. Only stored values for the latest
+    // "windowSize" elements, the index is "window location modulo
+    // windowSize".
+    private final int[] prev;
+
+    // bit mask used when indexing into prev
+    private final int wMask;
+
+    private boolean initialized = false;
+    // the position inside of window that shall be encoded right now
+    private int currentPosition;
+    // the number of bytes available to compress including the one at
+    // currentPosition
+    private int lookahead = 0;
+    // the hash of the three bytes stating at the current position
+    private int insertHash = 0;
+    // the position inside of the window where the current literal
+    // block starts (in case we are inside of a literal block).
+    private int blockStart = 0;
+    // position of the current match
+    private int matchStart = NO_MATCH;
+
     /**
      * Initializes a compressor with parameters and a callback.
      * @param params the parameters
@@ -158,6 +223,15 @@ public class LZ77Compressor {
         }
         this.params = params;
         this.callback = callback;
+
+        final int wSize = params.getWindowSize();
+        window = new byte[wSize * 2];
+        wMask = wSize - 1;
+        head = new int[HASH_SIZE];
+        for (int i = 0; i < HASH_SIZE; i++) {
+            head[i] = NO_MATCH;
+        }
+        prev = new int[wSize];
     }
 
     /**
@@ -179,6 +253,15 @@ public class LZ77Compressor {
      * @param len the number of bytes to compress
      */
     public void compress(byte[] data, int off, int len) {
+        final int wSize = params.getWindowSize();
+        while (len > wSize) {
+            doCompress(data, off, wSize);
+            off += wSize;
+            len -= wSize;
+        }
+        if (len > 0) {
+            doCompress(data, off, len);
+        }
     }
 
     /**
@@ -190,6 +273,131 @@ public class LZ77Compressor {
      * the execution of this method.</p>
      */
     public void finish() {
+        if (blockStart != currentPosition || lookahead > 0) {
+            currentPosition += lookahead;
+            flushLiteralBlock();
+        }
+        callback.accept(THE_EOD);
+    }
+
+    // we use a 15 bit hashcode as calculated in updateHash
+    private static final int HASH_SIZE = 1 << 15;
+    private static final int HASH_MASK = HASH_SIZE - 1;
+    private static final int H_SHIFT = 5;
+
+    /**
+     * Assumes we are calculating the hash for three consecutive bytes
+     * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC
+     * the new hash for BCD is nextHash(H, D).
+     *
+     * <p>The hash is shifted by five bits on each update so all
+     * effects of A have been swapped after the third update.</p>
+     */
+    private int nextHash(int oldHash, byte nextByte) {
+        final int nextVal = nextByte & 0xFF;
+        return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK;
+    }
+
+    // performs the actual algorithm with the pre-condition len <= windowSize
+    private void doCompress(byte[] data, int off, int len) {
+        int spaceLeft = window.length - currentPosition - lookahead;
+        if (len > spaceLeft) {
+            slide();
+        }
+        System.arraycopy(data, off, window, currentPosition + lookahead, len);
+        lookahead += len;
+        if (!initialized && lookahead >= params.getMinMatchSize()) {
+            initialize();
+        }
+        if (initialized) {
+            compress();
+        }
+    }
+
+    private void slide() {
+        final int wSize = params.getWindowSize();
+        System.arraycopy(window, wSize, window, 0, wSize);
+        currentPosition -= wSize;
+        matchStart -= wSize;
+        blockStart -= wSize;
+        for (int i = 0; i< HASH_SIZE; i++) {
+            int h = head[i];
+            head[i] = h >= wSize ? h - wSize : NO_MATCH;
+        }
+        for (int i = 0; i < wSize; i++) {
+            int p = prev[i];
+            prev[i] = p >= wSize ? p - wSize : NO_MATCH;
+        }
+    }
+
+    private void initialize() {
+        for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
+            insertHash = nextHash(insertHash, window[i]);
+        }
+        initialized = true;
+    }
+
+    private void compress() {
+        final int minMatch = params.getMinMatchSize();
+
+        while (lookahead >= minMatch) {
+            int matchLength = 0;
+            int hashHead = insertString();
+            if (hashHead != NO_MATCH && hashHead - currentPosition <= 
params.getMaxOffset()) {
+                // sets matchStart as a side effect
+                matchLength = longestMatch(hashHead);
+            }
+            if (matchLength >= minMatch) {
+                if (blockStart != currentPosition) {
+                    // emit preceeding literal block
+                    flushLiteralBlock();
+                    blockStart = NO_MATCH;
+                }
+                lookahead -= matchLength;
+                // inserts strings contained in current match
+                for (int i = 0; i < matchLength - 1; i++) {
+                    currentPosition++;
+                    insertString();
+                }
+                currentPosition++;
+                flushBackReference(matchLength);
+                blockStart = currentPosition;
+            } else {
+                // no match, append to current or start a new literal
+                lookahead--;
+                currentPosition++;
+                if (currentPosition - blockStart >= 
params.getMaxLiteralSize()) {
+                    flushLiteralBlock();
+                    blockStart = currentPosition;
+                }
+            }
+        }
     }
 
+    /**
+     * Inserts the current three byte sequence into the dictionary and
+     * returns the previous previous head of the hash-chain.
+     *
+     * <p>Updates <code>insertHash</code> and <code>prev</code> as a
+     * side effect.</p>
+     */
+    private int insertString() {
+        insertHash = nextHash(insertHash, window[currentPosition -1 + 
NUMBER_OF_BYTES_IN_HASH]);
+        int hashHead = head[insertHash];
+        prev[currentPosition & wMask] = hashHead;
+        head[insertHash] = currentPosition;
+        return hashHead;
+    }
+
+    private void flushBackReference(int matchLength) {
+        callback.accept(new BackReference(matchStart, matchLength));
+    }
+
+    private void flushLiteralBlock() {
+        callback.accept(new LiteralBlock(window, blockStart, currentPosition - 
blockStart));
+    }
+
+    private int longestMatch(int matchHead) {
+        return 0;
+    }
 }

http://git-wip-us.apache.org/repos/asf/commons-compress/blob/1c6d4006/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java
 
b/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java
index 84548d6..6f2b2cb 100644
--- 
a/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java
+++ 
b/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java
@@ -22,7 +22,7 @@ package org.apache.commons.compress.compressors.lz77support;
  * Parameters of the {@link LZ77Compressor compressor}.
  */
 public final class Parameters {
-    public static final int TRUE_MIN_MATCH_SIZE = 3;
+    public static final int TRUE_MIN_MATCH_SIZE = 
LZ77Compressor.NUMBER_OF_BYTES_IN_HASH;
     private final int windowSize, minMatchSize, maxMatchSize, maxOffset, 
maxLiteralSize;
 
     /**

http://git-wip-us.apache.org/repos/asf/commons-compress/blob/1c6d4006/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java
 
b/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java
new file mode 100644
index 0000000..202c5da
--- /dev/null
+++ 
b/src/test/java/org/apache/commons/compress/compressors/lz77support/LZ77CompressorTest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.commons.compress.compressors.lz77support;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+public class LZ77CompressorTest {
+
+    private List<LZ77Compressor.Block> compress(Parameters params, byte[]... 
chunks) {
+        final List<LZ77Compressor.Block> blocks = new ArrayList<>();
+        LZ77Compressor c = new LZ77Compressor(params, new 
LZ77Compressor.Callback() {
+                @Override
+                public void accept(LZ77Compressor.Block block) {
+                    if (block instanceof LZ77Compressor.LiteralBlock) {
+                        // replace with a real copy of data so tests
+                        // can see the results as they've been when
+                        // the callback has been called
+                        LZ77Compressor.LiteralBlock b = 
(LZ77Compressor.LiteralBlock) block;
+                        int len = b.getLength();
+                        block = new LZ77Compressor.LiteralBlock(
+                            Arrays.copyOfRange(b.getData(), b.getOffset(), 
b.getOffset() + len),
+                            0, len);
+                    }
+                    blocks.add(block);
+                }
+            });
+        for (byte[] chunk : chunks) {
+            c.compress(chunk);
+        }
+        c.finish();
+        return blocks;
+    }
+
+    @Test
+    public void nonCompressableWithLengthSmallerThanLiteralMax() {
+        byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(128), 
data);
+        assertEquals(2, blocks.size());
+        assertEquals(LZ77Compressor.LiteralBlock.class, 
blocks.get(0).getClass());
+        assertArrayEquals(data, ((LZ77Compressor.LiteralBlock) 
blocks.get(0)).getData());
+        assertEquals(LZ77Compressor.EOD.class, blocks.get(1).getClass());
+    }
+
+    @Test
+    public void 
nonCompressableWithLengthGreaterThanLiteralMaxButLessThanTwiceWindowSize() {
+        byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(8), data);
+        assertEquals(3, blocks.size());
+        assertEquals(LZ77Compressor.LiteralBlock.class, 
blocks.get(0).getClass());
+        assertArrayEquals(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 },
+            ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData());
+        assertEquals(LZ77Compressor.LiteralBlock.class, 
blocks.get(1).getClass());
+        assertArrayEquals(new byte[] { 9, 10 },
+            ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData());
+        assertEquals(LZ77Compressor.EOD.class, blocks.get(2).getClass());
+    }
+
+    @Test
+    public void nonCompressableWithLengthThatForcesWindowSlide() {
+        byte[] data = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(4), data);
+        assertEquals(4, blocks.size());
+        assertEquals(LZ77Compressor.LiteralBlock.class, 
blocks.get(0).getClass());
+        assertArrayEquals(new byte[] { 1, 2, 3, 4 },
+            ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData());
+        assertEquals(LZ77Compressor.LiteralBlock.class, 
blocks.get(1).getClass());
+        assertArrayEquals(new byte[] { 5, 6, 7, 8 },
+            ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData());
+        assertEquals(LZ77Compressor.LiteralBlock.class, 
blocks.get(2).getClass());
+        assertArrayEquals(new byte[] { 9, 10 },
+            ((LZ77Compressor.LiteralBlock) blocks.get(2)).getData());
+        assertEquals(LZ77Compressor.EOD.class, blocks.get(3).getClass());
+    }
+
+    @Test
+    public void nonCompressableSentAsSingleBytes() {
+        List<LZ77Compressor.Block> blocks = compress(new Parameters(8), new 
byte[][] {
+                { 1 }, { 2 }, { 3 } , { 4 }, { 5 },
+                { 6 }, { 7 }, { 8 } , { 9 }, { 10 },
+            });
+        assertEquals(3, blocks.size());
+        assertEquals(LZ77Compressor.LiteralBlock.class, 
blocks.get(0).getClass());
+        assertArrayEquals(new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 },
+            ((LZ77Compressor.LiteralBlock) blocks.get(0)).getData());
+        assertEquals(LZ77Compressor.LiteralBlock.class, 
blocks.get(1).getClass());
+        assertArrayEquals(new byte[] { 9, 10 },
+            ((LZ77Compressor.LiteralBlock) blocks.get(1)).getData());
+        assertEquals(LZ77Compressor.EOD.class, blocks.get(2).getClass());
+    }
+
+}

Reply via email to