Repository: commons-compress Updated Branches: refs/heads/master 5fe31efd6 -> 0557d0297
experimental API for generic compressor for LZ77-like alhorithms Right now I'm experimenting with the API as a building block for adding write support to Snappy. The hard part of LZ77 (finding matches for back-references) is generic enough to extract it. If this works out it may help us implementing compressors for Snappy or LZ4 and with quite a bit of additional work for Brotli or DEFLATE64 (which use additional encoders on top of LZ77). Project: http://git-wip-us.apache.org/repos/asf/commons-compress/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-compress/commit/0557d029 Tree: http://git-wip-us.apache.org/repos/asf/commons-compress/tree/0557d029 Diff: http://git-wip-us.apache.org/repos/asf/commons-compress/diff/0557d029 Branch: refs/heads/master Commit: 0557d0297bb455570c3f8174e511b23458afe8bd Parents: 5fe31ef Author: Stefan Bodewig <[email protected]> Authored: Fri Jan 6 21:50:46 2017 +0100 Committer: Stefan Bodewig <[email protected]> Committed: Fri Jan 6 21:50:46 2017 +0100 ---------------------------------------------------------------------- .../compressors/lz77support/LZ77Compressor.java | 195 +++++++++++++++++++ .../compressors/lz77support/Parameters.java | 116 +++++++++++ .../compressors/lz77support/ParametersTest.java | 110 +++++++++++ 3 files changed, 421 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-compress/blob/0557d029/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java new file mode 100644 index 0000000..468da84 --- /dev/null +++ b/src/main/java/org/apache/commons/compress/compressors/lz77support/LZ77Compressor.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.commons.compress.compressors.lz77support; + +/** + * Helper class for compression algorithms that use the ideas of LZ77. + * + * <p>Most LZ77 derived algorithms split input data into blocks of + * uncompressed data (called literal blocks) and back-references + * (pairs of offsets and lengths) that state "add <code>length</code> + * bytes that are the same as those already written starting + * <code>offset</code> bytes before the current position. The details + * of how those blocks and back-references are encoded are quite + * different between the algorithms and some algorithms perform + * additional steps (Huffman encoding in the case of DEFLATE for + * example).</p> + * + * <p>This class attempts to extract the core logic - finding + * back-references - so it can be re-used. It follows the algorithm + * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't + * implement the "lazy match" optimization. The three-byte hash function + * used in this class is the same used by zlib and InfoZIP's ZIP + * implementation of DEFLATE.</p> + * + * <p>LZ77 is used vaguely here (as well as many other places that + * talk about it :-), LZSS would likely be closer to the truth but + * LZ77 has become the synonym for a whole family of algorithms.</p> + * + * <p>The API consists of a compressor that is fed <code>byte</code>s + * and emits {@link Block}s to a registered callback where the blocks + * represent either {@link LiteralBlock literal blocks}, {@link + * BackReference back references} or {@link EOD end of data + * markers}. In order to ensure the callback receives all information, + * the {@code #finish} method must be used once all data has been fed + * into the compressor.</p> + * + * <p>Several parameters influence the outcome of the "compression":</p> + * <dl> + * + * <dt><code>windowSize</code></dt> <dd>the size of the sliding + * window, must be a power of two - this determines the maximum + * offset a back-reference can take. The compressor maintains a + * buffer of twice of <code>windowSize</code> - real world values are + * in the area of 32k.</dd> + * + * <dt><code>minMatchSize</code></dt> + * <dd>Minimal size of a match found. A true minimum of 3 is + * hard-coded inside of this implemention but bigger sizes can be + * configured.</dd> + * + * <dt><code>maxMatchSize</code></dt> + * <dd>Maximal size of a match found.</dd> + * + * <dt><code>maxOffset</code></dt> + * <dd>Maximal offset of a back-reference.</dd> + * + * <dt><code>maxLiteralSize</code></dt> + * <dd>Maximal size of a literal block.</dd> + * </dl> + * + * @see "https://tools.ietf.org/html/rfc1951#section-4" + * @since 1.14 + * @NotThreadSafe + */ +public class LZ77Compressor { + + /** + * Base class representing things the compressor may emit. + */ + public static abstract class Block { } + /** + * Represents a literal block of data. + */ + public static final class LiteralBlock extends Block { + private final byte[] data; + private LiteralBlock(byte[] data) { + this.data = data; + } + /** + * The literal data. + * + * <p>This returns a life view of the actual data in order to + * avoid copying, modify the array at your own risk.</p> + */ + public byte[] getData() { + return data; + } + } + /** + * Represents a back-reference to a match. + */ + public static final class BackReference extends Block { + private final int offset, length; + private BackReference(int offset, int length) { + this.offset = offset; + this.length = length; + } + /** + * Provides the offset of the match. + */ + public int getOffset() { + return offset; + } + /** + * Provides the length of the match. + */ + public int getLength() { + return length; + } + } + /** + * A simple "we are done" marker. + */ + public static final class EOD extends Block { } + + /** + * Callback invoked while the compressor processes data. + * + * <p>The callback is invoked on the same thread that receives the + * bytes to compress and may be invoked multiple times during the + * execution of {@link #compress} or {@link #finish}.</p> + */ + public interface Callback /* extends Consumer<Block> */ { + void accept(Block b); + } + + private final Parameters params; + private final Callback callback; + + /** + * Initializes a compressor with parameters and a callback. + * @param params the parameters + * @param callback the callback + * @throws NullPointerException if either parameter is <code>null</code> + */ + public LZ77Compressor(Parameters params, Callback callback) { + if (params == null) { + throw new NullPointerException("params must not be null"); + } + if (callback == null) { + throw new NullPointerException("callback must not be null"); + } + this.params = params; + this.callback = callback; + } + + /** + * Feeds bytes into the compressor which in turn may emit zero or + * more blocks to the callback during the execution of this + * method. + * @param data the data to compress - must not be null + */ + public void compress(byte[] data) { + compress(data, 0, data.length); + } + + /** + * Feeds bytes into the compressor which in turn may emit zero or + * more blocks to the callback during the execution of this + * method. + * @param data the data to compress - must not be null + * @param off the start offset of the data + * @param len the number of bytes to compress + */ + public void compress(byte[] data, int off, int len) { + } + + /** + * Tells the compressor to process all remaining data and signal + * end of data to the callback. + * + * <p>The compressor will in turn emit at least one block ({@link + * EOD}) but potentially multiple blocks to the callback during + * the execution of this method.</p> + */ + public void finish() { + } + +} http://git-wip-us.apache.org/repos/asf/commons-compress/blob/0557d029/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java b/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java new file mode 100644 index 0000000..84548d6 --- /dev/null +++ b/src/main/java/org/apache/commons/compress/compressors/lz77support/Parameters.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.commons.compress.compressors.lz77support; + +/** + * Parameters of the {@link LZ77Compressor compressor}. + */ +public final class Parameters { + public static final int TRUE_MIN_MATCH_SIZE = 3; + private final int windowSize, minMatchSize, maxMatchSize, maxOffset, maxLiteralSize; + + /** + * Initializes the compressor's parameters with a + * <code>minMatchSize</code> of 3 and <code>max*Size</code> + * equal to <code>windowSize</code>. + * + * @param windowSize the size of the sliding window - this + * determines the maximum offset a back-reference can take. + * @throws IllegalArgumentException if <code>windowSize</code> + * is smaller than <code>minMatchSize</code>. + */ + public Parameters(int windowSize) { + this(windowSize, TRUE_MIN_MATCH_SIZE, windowSize, windowSize, windowSize); + } + + /** + * Initializes the compressor's parameters. + * + * @param windowSize the size of the sliding window, must be a + * power of two - this determines the maximum offset a + * back-reference can take. + * @param minMatchSize the minimal size of a match found. A + * true minimum of 3 is hard-coded inside of this implemention + * but bigger sizes can be configured. + * @param maxMatchSize maximal site of a match found. A value + * smaller than <code>minMatchSize</code> is interpreted as + * infinite (actually {@link Integer.MAX_VALUE}). + * @param maxOffset maximal offset of a back-reference. A + * non-positive value is interpreted as <code>windowSize</code>. + * @param maxLiteralSize maximal size of a literal block. Negative + * numbers and 0 as well as values bigger than <code>2 * + * windowSize</code> are interpreted as <code>windowSize</code>. + * @throws IllegalArgumentException if <code>windowSize</code> is + * smaller than <code>minMatchSize</code> or not a power of two. + */ + public Parameters(int windowSize, int minMatchSize, int maxMatchSize, + int maxOffset, int maxLiteralSize) { + this.minMatchSize = Math.max(TRUE_MIN_MATCH_SIZE, minMatchSize); + if (windowSize < this.minMatchSize) { + throw new IllegalArgumentException("windowSize must be at least as big as minMatchSize"); + } + if (!isPowerOfTwo(windowSize)) { + throw new IllegalArgumentException("windowSize must be a power of two"); + } + this.windowSize = windowSize; + this.maxOffset = maxOffset < 1 ? this.windowSize + : Math.min(maxOffset, this.windowSize); + this.maxMatchSize = maxMatchSize < this.minMatchSize ? Integer.MAX_VALUE + : maxMatchSize; + this.maxLiteralSize = maxLiteralSize < 1 || maxLiteralSize > 2 * windowSize + ? windowSize : maxLiteralSize; + } + + /** + * Gets the size of the sliding window - this determines the + * maximum offset a back-reference can take. + */ + public int getWindowSize() { + return windowSize; + } + /** + * Gets the minimal size of a match found. + */ + public int getMinMatchSize() { + return minMatchSize; + } + /** + * Gets the maximal size of a match found. + */ + public int getMaxMatchSize() { + return maxMatchSize; + } + /** + * Gets the maximal offset of a match found. + */ + public int getMaxOffset() { + return maxOffset; + } + /** + * Gets the maximal size of a literal block. + */ + public int getMaxLiteralSize() { + return maxLiteralSize; + } + + private static final boolean isPowerOfTwo(int x) { + // pre-condition: x > 0 + return (x & (x - 1)) == 0; + } +} http://git-wip-us.apache.org/repos/asf/commons-compress/blob/0557d029/src/test/java/org/apache/commons/compress/compressors/lz77support/ParametersTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/compress/compressors/lz77support/ParametersTest.java b/src/test/java/org/apache/commons/compress/compressors/lz77support/ParametersTest.java new file mode 100644 index 0000000..43317cc --- /dev/null +++ b/src/test/java/org/apache/commons/compress/compressors/lz77support/ParametersTest.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.commons.compress.compressors.lz77support; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class ParametersTest { + + @Test + public void defaultConstructor() { + Parameters p = new Parameters(128); + assertEquals(128, p.getWindowSize()); + assertEquals(3, p.getMinMatchSize()); + assertEquals(128, p.getMaxMatchSize()); + assertEquals(128, p.getMaxOffset()); + assertEquals(128, p.getMaxLiteralSize()); + } + + @Test + public void minMatchSizeIsAtLeastThree() { + Parameters p = new Parameters(128, 2, 3, 4, 5); + assertEquals(3, p.getMinMatchSize()); + } + + @Test + public void maxMatchSizeIsInfiniteWhenSmallerThanMinMatchSize() { + Parameters p = new Parameters(128, 2, 2, 4, 5); + assertEquals(Integer.MAX_VALUE, p.getMaxMatchSize()); + } + + @Test + public void maxMatchSizeIsMinMatchSizeIfBothAreEqual() { + Parameters p = new Parameters(128, 2, 3, 4, 5); + assertEquals(3, p.getMaxMatchSize()); + } + + @Test + public void maxOffsetIsWindowSizeIfSetTo0() { + Parameters p = new Parameters(128, 2, 3, 0, 5); + assertEquals(128, p.getMaxOffset()); + } + + @Test + public void maxOffsetIsWindowSizeIfSetToANegativeValue() { + Parameters p = new Parameters(128, 2, 3, -1, 5); + assertEquals(128, p.getMaxOffset()); + } + + @Test + public void maxOffsetIsWindowSizeIfBiggerThanWindowSize() { + Parameters p = new Parameters(128, 2, 3, 129, 5); + assertEquals(128, p.getMaxOffset()); + } + + @Test + public void maxLiteralSizeIsWindowSizeIfSetTo0() { + Parameters p = new Parameters(128, 2, 3, 4, 0); + assertEquals(128, p.getMaxLiteralSize()); + } + + @Test + public void maxLiteralSizeIsWindowSizeIfSetToANegativeValue() { + Parameters p = new Parameters(128, 2, 3, 0, -1); + assertEquals(128, p.getMaxLiteralSize()); + } + + @Test + public void maxLiteralSizeIsWindowSizeIfSetToAValueTooBigToHoldInSlidingWindow() { + Parameters p = new Parameters(128, 2, 3, 0, 259); + assertEquals(128, p.getMaxLiteralSize()); + } + + @Test + public void allParametersUsuallyTakeTheirSpecifiedValues() { + Parameters p = new Parameters(256, 4, 5, 6, 7); + assertEquals(256, p.getWindowSize()); + assertEquals(4, p.getMinMatchSize()); + assertEquals(5, p.getMaxMatchSize()); + assertEquals(6, p.getMaxOffset()); + assertEquals(7, p.getMaxLiteralSize()); + } + + @Test(expected = IllegalArgumentException.class) + public void windowSizeMustNotBeSmallerThanMinMatchSize() { + new Parameters(128, 200, 300, 400, 500); + } + + @Test(expected = IllegalArgumentException.class) + public void windowSizeMustNotBeAPowerOfTwo() { + new Parameters(100, 200, 300, 400, 500); + } +}
