This is an automated email from the ASF dual-hosted git repository. pkarwasz pushed a commit to branch feat/huffman1 in repository https://gitbox.apache.org/repos/asf/commons-compress.git
commit e155b68a9d3ef95418b6f28336db8504bd3991f5 Author: Piotr P. Karwasz <pkarwasz-git...@apache.org> AuthorDate: Tue Aug 26 23:39:28 2025 +0200 feat: enforce 20-bit Huffman code-length limit consistently This PR improves `BZip2CompressorInputStream` by enforcing the mandated Huffman code-length bound and right-sizing related tables. It aligns behavior with the reference C implementation’s intent while avoiding its historical over-allocation. ## Why The reference C code deliberately **over-sizes** its Huffman structures as a defensive programming technique: e.g., it defines `BZ_MAX_CODE_LEN = 23` and sizes length-indexed tables by `BZ_MAX_ALPHA_SIZE` rather than the code-length bound to provide safety margins. However, the bzip2 bitstream only permits code lengths ≤ 20 (and the 1.0.6 implementation effectively tightens this to 17), so carrying those defensive constants forward in our code obscures the true limit and keeps arrays [...] ## What’s changed * **Set the true limit**: `MAX_CODE_LEN` now equals **20** (the format maximum). * **Validate inputs**: code lengths are checked explicitly; any value **outside `[1, 20]`** triggers a clear exception early in decoding. * **Right-size arrays**: Huffman tables and auxiliary arrays are sized to the minimum required for the 20-bit limit, reducing footprint and clarifying invariants. * **Tests**: add a unit test that exercises the maximum alphabet size across boundary code lengths. ## Maintenance & performance * Clearer invariants (the limit you see is the limit you enforce). * Slightly lower memory usage due to smaller tables. * Easier reasoning about bounds and indexing during review and future changes. ## References * [bzip2/huffman.c](https://github.com/libarchive/bzip2/blob/master/huffman.c) * [bzip2/bzlib_private.h](https://github.com/libarchive/bzip2/blob/master/bzlib_private.h) (Reference implementation uses over-sized constants for safety but enforces an effective 20-bit maximum.) --- src/changes/changes.xml | 2 +- .../bzip2/BZip2CompressorInputStream.java | 26 +++++++++++++--------- .../compress/compressors/bzip2/BZip2Constants.java | 14 ++++++++++-- .../bzip2/BZip2CompressorInputStreamTest.java | 18 +++++++++++++++ 4 files changed, 47 insertions(+), 13 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index c7f2cd91a..6560b378d 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -58,7 +58,7 @@ The <action> type attribute can be add,update,fix,remove. <action type="fix" dev="ggregory" due-to="Gary Gregory">Don't loose precision while reading folders from a SevenZFile.</action> <action type="fix" dev="ggregory" due-to="Roel van Dijk, Gary Gregory">Improve some exception messages in TarUtils and TarArchiveEntry.</action> <!-- FIX bzip2 --> - <action type="fix" dev="ggregory" due-to="Tyler Nighswander, Gary Gregory">org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream constructors now throws CompressorException instead of ArrayIndexOutOfBoundsException.</action> + <action type="fix" dev="pkarwasz" due-to="Tyler Nighswander, Gary Gregory">Enforce 20-bit Huffman code-length limit consistently in BZip2.</action> <action type="fix" dev="ggregory" due-to="Gary Gregory">org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream throws CompressorException (an IOException subclass) instead of IOException when it encounters formatting problems.</action> <!-- FIX dump --> <action type="fix" dev="pkarwasz" due-to="Tyler Nighswander">Align DUMP archive block size with Linux `dump` utility.</action> diff --git a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java index ea21b3f80..68a10f717 100644 --- a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java +++ b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java @@ -42,7 +42,8 @@ */ public class BZip2CompressorInputStream extends CompressorInputStream implements BZip2Constants, InputStreamStatistics { - private static final class Data { + // package private for testing + static final class Data { // (with blockSize 900k) final boolean[] inUse = new boolean[256]; // 256 byte @@ -56,8 +57,10 @@ private static final class Data { */ final int[] unzftab = new int[256]; // 1024 byte - final int[][] limit = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte - final int[][] base = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte + // Needs indexes from 0 to MAX_CODE_LEN inclusive. + final int[][] limit = new int[N_GROUPS][MAX_CODE_LEN + 1]; + // Needs indexes from 0 to MAX_CODE_LEN + 1 inclusive. + final int[][] base = new int[N_GROUPS][MAX_CODE_LEN + 2]; final int[][] perm = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte final int[] minLens = new int[N_GROUPS]; // 24 byte @@ -157,7 +160,7 @@ private static void checkBounds(final int checkVal, final int limitExclusive, fi * Called by createHuffmanDecodingTables() exclusively. */ private static void hbCreateDecodeTables(final int[] limit, final int[] base, final int[] perm, final char[] length, final int minLen, final int maxLen, - final int alphaSize) throws IOException { + final int alphaSize) { for (int i = minLen, pp = 0; i <= maxLen; i++) { for (int j = 0; j < alphaSize; j++) { if (length[j] == i) { @@ -171,7 +174,6 @@ private static void hbCreateDecodeTables(final int[] limit, final int[] base, fi } for (int i = 0; i < alphaSize; i++) { final int len = length[i] + 1; - checkBounds(len, MAX_ALPHA_SIZE, "length"); base[len]++; } for (int i = 1, b = base[0]; i < MAX_CODE_LEN; i++) { @@ -298,8 +300,7 @@ private boolean complete() throws IOException { /** * Called by recvDecodingTables() exclusively. */ - private void createHuffmanDecodingTables(final int alphaSize, final int nGroups) throws IOException { - final Data dataShadow = this.data; + static void createHuffmanDecodingTables(final int alphaSize, final int nGroups, final Data dataShadow) throws IOException { final char[][] len = dataShadow.temp_charArray2d; final int[] minLens = dataShadow.minLens; final int[][] limit = dataShadow.limit; @@ -307,8 +308,8 @@ private void createHuffmanDecodingTables(final int alphaSize, final int nGroups) final int[][] perm = dataShadow.perm; for (int t = 0; t < nGroups; t++) { - int minLen = 32; - int maxLen = 0; + int minLen = MAX_CODE_LEN; + int maxLen = 1; final char[] len_t = len[t]; for (int i = alphaSize; --i >= 0;) { final char lent = len_t[i]; @@ -751,12 +752,17 @@ private void recvDecodingTables() throws IOException { while (bsGetBit(bin)) { curr += bsGetBit(bin) ? -1 : 1; } + // Same condition as in bzip2 + if (curr < 1 || curr > MAX_CODE_LEN) { + throw new CompressorException( + "Corrupted input, code length value out of range [%d, %d]: %d", 1, MAX_CODE_LEN, curr); + } len_t[i] = (char) curr; } } // finally create the Huffman tables - createHuffmanDecodingTables(alphaSize, nGroups); + createHuffmanDecodingTables(alphaSize, nGroups, dataShadow); } private int setupBlock() throws IOException { diff --git a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2Constants.java b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2Constants.java index 9350d2100..592b0b527 100644 --- a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2Constants.java +++ b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2Constants.java @@ -35,9 +35,19 @@ interface BZip2Constants { int MAX_ALPHA_SIZE = 258; /** - * Constant {@value}. + * Maximum allowed length of a Huffman code in BZip2. + * <p> + * In the original bzip2 C implementation, a defensive limit of 23 is set to prevent buffer overflows + * (see <a href="https://github.com/libarchive/bzip2/blob/master/bzlib_private.h">{@code bzip_private.h}</a>). + * However, the actual enforced maximum during decompression is 20 + * (see <a href="https://github.com/libarchive/bzip2/blob/master/decompress.c">BZ2_decompress in {@code decompress.c}</a>), + * and since version 1.0.6, compression uses a maximum of 17. + * </p> + * <p> + * The Java implementation does not require the higher defensive limit, so this constant is set to the true maximum of 20. + * </p> */ - int MAX_CODE_LEN = 23; + int MAX_CODE_LEN = 20; /** * Constant {@value}. diff --git a/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java b/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java index 778832668..387717009 100644 --- a/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java +++ b/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java @@ -18,6 +18,7 @@ */ package org.apache.commons.compress.compressors.bzip2; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -35,6 +36,7 @@ import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream.Data; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Test; @@ -149,4 +151,20 @@ void testSingleByteReadConsistentlyReturnsMinusOneAtEof() throws IOException { assertEquals(-1, in.read()); } } + + @Test + void testCreateHuffmanDecodingTablesWithLargeAlphaSize() { + final Data data = new Data(1); + // Use a codeLengths array with length equal to MAX_ALPHA_SIZE (258) to test array bounds. + final char[] codeLengths = new char[258]; + for (int i = 0; i < codeLengths.length; i++) { + // Use all code lengths within valid range [1, 20] + codeLengths[i] = (char) ((i % 20) + 1); + } + data.temp_charArray2d[0] = codeLengths; + assertDoesNotThrow( + () -> BZip2CompressorInputStream.createHuffmanDecodingTables(codeLengths.length, 1, data), + "createHuffmanDecodingTables should not throw for valid codeLengths array of MAX_ALPHA_SIZE"); + assertEquals(data.minLens[0], 1, "Minimum code length should be 1"); + } }