fkjellberg commented on code in PR #690: URL: https://github.com/apache/commons-compress/pull/690#discussion_r2310720423
########## src/main/java/org/apache/commons/compress/compressors/lha/AbstractLhStaticHuffmanCompressorInputStream.java: ########## @@ -0,0 +1,344 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.commons.compress.compressors.lha; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteOrder; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.utils.BitInputStream; +import org.apache.commons.compress.utils.CircularBuffer; +import org.apache.commons.compress.utils.InputStreamStatistics; +import org.apache.commons.io.input.CloseShieldInputStream; + +/** + * This is an implementation of a static Huffman compressor input stream for LHA files that + * supports lh4, lh5, lh6 and lh7 compression methods. + * + * This implementation is based on the documentation that can be found at + * https://github.com/jca02266/lha/blob/master/Hacking_of_LHa + */ +abstract class AbstractLhStaticHuffmanCompressorInputStream extends CompressorInputStream implements InputStreamStatistics { + // Constants for command tree decoding + private static final int COMMAND_DECODING_LENGTH_BITS = 5; // Number of bits used to encode the command decoding tree length + private static final int MAX_NUMBER_OF_COMMAND_DECODING_CODE_LENGTHS = 19; // Maximum number of codes in the command decoding tree + + // Constants for command tree + private static final int COMMAND_TREE_LENGTH_BITS = 9; // Number of bits used to encode the command tree length + + // Constants for code length + private static final int CODE_LENGTH_BITS = 3; // Number of bits used to encode the code length + private static final int MAX_CODE_LENGTH = 16; + + private BitInputStream bin; + private CircularBuffer buffer; + private int blockSize; + private BinaryTree commandTree; // Command is either a literal or a copy command + private BinaryTree distanceTree; // Distance is the offset to copy from the sliding dictionary + + /** + * Constructs a new CompressorInputStream which decompresses bytes read from the specified stream. + * + * @param in the InputStream from which to read compressed data + * @throws IOException if an I/O error occurs + */ + AbstractLhStaticHuffmanCompressorInputStream(final InputStream in) throws IOException { + this.bin = new BitInputStream(in == System.in ? CloseShieldInputStream.wrap(in) : in, ByteOrder.BIG_ENDIAN); + + // Create a sliding dictionary buffer that can hold the full dictionary size and the maximum match length + this.buffer = new CircularBuffer(getDictionarySize() + getMaxMatchLength()); + } + + @Override + public void close() throws IOException { + if (this.bin != null) { + try { + this.bin.close(); + } finally { + this.bin = null; + this.buffer = null; + this.blockSize = -1; + } + } + } + + /** + * Get the threshold for copying data from the sliding dictionary. This is the minimum + * possible number of bytes that will be part of a copy command. + * + * @return the copy threshold + */ + protected int getCopyThreshold() { + return 3; + } + + /** + * Get the number of bits used for the dictionary size. + * + * @return the number of bits used for the dictionary size + */ + protected abstract int getDictionaryBits(); + + /** + * Get the size of the dictionary. + * + * @return the size of the dictionary + */ + protected int getDictionarySize() { + return 1 << getDictionaryBits(); + } + + /** + * Get the number of bits used for the distance. + * + * @return the number of bits used for the distance + */ + protected abstract int getDistanceBits(); + + protected int getDistanceCodeSize() { + return getDictionaryBits() + 1; + } + + /** + * Get the maximum match length for the copy command. + * + * @return the maximum match length + */ + protected int getMaxMatchLength() { + return 256; + } + + /** + * Get the maximum number of commands in the command tree. + * This is 256 literals (0-255) and 254 copy lengths combinations (3-256). + * + * @return the maximum number of commands + */ + protected int getMaxNumberOfCommands() { + return 256 + getMaxMatchLength() - getCopyThreshold() + 1; + } + + @Override + public long getCompressedCount() { + return bin.getBytesRead(); + } + + @Override + public int read() throws IOException { + if (!buffer.available()) { + // Nothing in the buffer, try to fill it + fillBuffer(); + } + + final int ret = buffer.get(); + count(ret < 0 ? 0 : 1); // Increment input stream statistics + return ret; + } + + /** + * Fill the sliding dictionary with more data. + * + * @throws IOException if an I/O error occurs + */ + private void fillBuffer() throws IOException { + if (this.blockSize == -1) { + // End of stream + return; + } else if (this.blockSize == 0) { + // Start to read the next block + + // Read the block size (number of commands to read) + this.blockSize = (int) bin.readBits(16); + if (this.blockSize == -1) { + // End of stream + return; + } + + final BinaryTree commandDecodingTree = readCommandDecodingTree(); + + this.commandTree = readCommandTree(commandDecodingTree); + + this.distanceTree = readDistanceTree(); + } + + this.blockSize--; + + final int command = commandTree.read(bin); + if (command < 0x100) { + // Literal command, just write the byte to the buffer + buffer.put(command); + } else { + // Copy command, read the distance and calculate the length from the command + final int distance = readDistance(); + final int length = command - 0x100 + getCopyThreshold(); + + // Copy the data from the sliding dictionary and add to the buffer + buffer.copy(distance + 1, length); + } + } + + /** + * Read the command decoding tree. The command decoding tree is used when reading the command tree + * which is then actually used to decode the commands (literals or copy commands). + * + * @return the command decoding tree + * @throws IOException if an I/O error occurs + */ + protected BinaryTree readCommandDecodingTree() throws IOException { + // Number of code lengths to read + final int numCodeLengths = (int) bin.readBits(COMMAND_DECODING_LENGTH_BITS); + + if (numCodeLengths > MAX_NUMBER_OF_COMMAND_DECODING_CODE_LENGTHS) { + throw new CompressorException("Code length table has invalid size (%d > %d)", numCodeLengths, MAX_NUMBER_OF_COMMAND_DECODING_CODE_LENGTHS); + } else if (numCodeLengths == 0) { + // If numCodeLengths is zero, we read a single code length of COMMAND_DECODING_LENGTH_BITS bits and use as root of the tree + return new BinaryTree(new int[] { (int) bin.readBits(COMMAND_DECODING_LENGTH_BITS) }); + } else { + // Read all code lengths + final int[] codeLengths = new int[numCodeLengths]; + for (int index = 0; index < numCodeLengths; index++) { + codeLengths[index] = readCodeLength(); + + if (index == 2) { + // After reading the first three code lengths, we read a 2-bit skip range + index += (int) bin.readBits(2); + } + } + + return new BinaryTree(codeLengths); + } + } + + /** + * Read code length (depth in tree). Usually 0-7 but could be higher and if so, + * count the number of following consecutive one bits and add to the length. + * + * @return code length + * @throws IOException if an I/O error occurs + */ + protected int readCodeLength() throws IOException { + int len = (int) bin.readBits(CODE_LENGTH_BITS); + if (len == 0x07) { + // Count the number of following consecutive one bits + while (bin.readBit() == 1) { + if (len == MAX_CODE_LENGTH) { + throw new CompressorException("Code length overflow"); + } + + len++; + } + } + + return len; + } + + /** + * Read the command tree which is used to decode the commands (literals or copy commands). + * + * @param commandDecodingTree the Huffman tree used to decode the command lengths + * @return the command tree + * @throws IOException if an I/O error occurs + */ + protected BinaryTree readCommandTree(final BinaryTree commandDecodingTree) throws IOException { + final int numCodeLengths = (int) bin.readBits(COMMAND_TREE_LENGTH_BITS); + + if (numCodeLengths > getMaxNumberOfCommands()) { + throw new CompressorException("Code length table has invalid size (%d > %d)", numCodeLengths, getMaxNumberOfCommands()); + } else if (numCodeLengths == 0) { + // If numCodeLengths is zero, we read a single code length of COMMAND_TREE_LENGTH_BITS bits and use as root of the tree + return new BinaryTree(new int[] { (int) bin.readBits(COMMAND_TREE_LENGTH_BITS) }); + } else { + // Read all code lengths + final int[] codeLengths = new int[numCodeLengths]; + + for (int index = 0; index < numCodeLengths;) { + final int codeOrSkipRange = commandDecodingTree.read(bin); + + if (codeOrSkipRange == 0) { + // Skip one code length + index++; + } else if (codeOrSkipRange == 1) { + // Skip a range of code lengths, read 4 bits to determine how many to skip + index += (int) bin.readBits(4) + 3; + } else if (codeOrSkipRange == 2) { + // Skip a range of code lengths, read 9 bits to determine how many to skip + index += (int) bin.readBits(9) + 20; + } else { + // Subtract 2 from the codeOrSkipRange to get the code length + codeLengths[index++] = codeOrSkipRange - 2; Review Comment: I do check for EOF when `codeOrSkipRange` returns -1 above? What am I missing? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@commons.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org