This is an automated email from the ASF dual-hosted git repository.

pkarwasz pushed a commit to branch feat/huffman1
in repository https://gitbox.apache.org/repos/asf/commons-compress.git

commit e155b68a9d3ef95418b6f28336db8504bd3991f5
Author: Piotr P. Karwasz <pkarwasz-git...@apache.org>
AuthorDate: Tue Aug 26 23:39:28 2025 +0200

    feat: enforce 20-bit Huffman code-length limit consistently
    
    This PR improves `BZip2CompressorInputStream` by enforcing the mandated 
Huffman code-length bound and right-sizing related tables. It aligns behavior 
with the reference C implementation’s intent while avoiding its historical 
over-allocation.
    
    ## Why
    
    The reference C code deliberately **over-sizes** its Huffman structures as 
a defensive programming technique: e.g., it defines `BZ_MAX_CODE_LEN = 23` and 
sizes length-indexed tables by `BZ_MAX_ALPHA_SIZE` rather than the code-length 
bound to provide safety margins. However, the bzip2 bitstream only permits code 
lengths ≤ 20 (and the 1.0.6 implementation effectively tightens this to 17), so 
carrying those defensive constants forward in our code obscures the true limit 
and keeps arrays  [...]
    
    ## What’s changed
    
    * **Set the true limit**: `MAX_CODE_LEN` now equals **20** (the format 
maximum).
    
    * **Validate inputs**: code lengths are checked explicitly; any value 
**outside `[1, 20]`** triggers a clear exception early in decoding.
    
    * **Right-size arrays**: Huffman tables and auxiliary arrays are sized to 
the minimum required for the 20-bit limit, reducing footprint and clarifying 
invariants.
    
    * **Tests**: add a unit test that exercises the maximum alphabet size 
across boundary code lengths.
    
    ## Maintenance & performance
    
    * Clearer invariants (the limit you see is the limit you enforce).
    * Slightly lower memory usage due to smaller tables.
    * Easier reasoning about bounds and indexing during review and future 
changes.
    
    ## References
    
    * 
[bzip2/huffman.c](https://github.com/libarchive/bzip2/blob/master/huffman.c)
    * 
[bzip2/bzlib_private.h](https://github.com/libarchive/bzip2/blob/master/bzlib_private.h)
      (Reference implementation uses over-sized constants for safety but 
enforces an effective 20-bit maximum.)
---
 src/changes/changes.xml                            |  2 +-
 .../bzip2/BZip2CompressorInputStream.java          | 26 +++++++++++++---------
 .../compress/compressors/bzip2/BZip2Constants.java | 14 ++++++++++--
 .../bzip2/BZip2CompressorInputStreamTest.java      | 18 +++++++++++++++
 4 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index c7f2cd91a..6560b378d 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -58,7 +58,7 @@ The <action> type attribute can be add,update,fix,remove.
       <action type="fix" dev="ggregory" due-to="Gary Gregory">Don't loose 
precision while reading folders from a SevenZFile.</action>
       <action type="fix" dev="ggregory" due-to="Roel van Dijk, Gary 
Gregory">Improve some exception messages in TarUtils and 
TarArchiveEntry.</action>
       <!-- FIX bzip2 -->      
-      <action type="fix" dev="ggregory" due-to="Tyler Nighswander, Gary 
Gregory">org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream
 constructors now throws CompressorException instead of 
ArrayIndexOutOfBoundsException.</action>
+      <action type="fix" dev="pkarwasz" due-to="Tyler Nighswander, Gary 
Gregory">Enforce 20-bit Huffman code-length limit consistently in 
BZip2.</action>
       <action type="fix" dev="ggregory" due-to="Gary 
Gregory">org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream
 throws CompressorException (an IOException subclass) instead of IOException 
when it encounters formatting problems.</action>
       <!-- FIX dump -->      
       <action type="fix" dev="pkarwasz" due-to="Tyler Nighswander">Align DUMP 
archive block size with Linux `dump` utility.</action>
diff --git 
a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java
 
b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java
index ea21b3f80..68a10f717 100644
--- 
a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java
+++ 
b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java
@@ -42,7 +42,8 @@
  */
 public class BZip2CompressorInputStream extends CompressorInputStream 
implements BZip2Constants, InputStreamStatistics {
 
-    private static final class Data {
+    // package private for testing
+    static final class Data {
 
         // (with blockSize 900k)
         final boolean[] inUse = new boolean[256]; // 256 byte
@@ -56,8 +57,10 @@ private static final class Data {
          */
         final int[] unzftab = new int[256]; // 1024 byte
 
-        final int[][] limit = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte
-        final int[][] base = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte
+        // Needs indexes from 0 to MAX_CODE_LEN inclusive.
+        final int[][] limit = new int[N_GROUPS][MAX_CODE_LEN + 1];
+        // Needs indexes from 0 to MAX_CODE_LEN + 1 inclusive.
+        final int[][] base = new int[N_GROUPS][MAX_CODE_LEN + 2];
         final int[][] perm = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte
         final int[] minLens = new int[N_GROUPS]; // 24 byte
 
@@ -157,7 +160,7 @@ private static void checkBounds(final int checkVal, final 
int limitExclusive, fi
      * Called by createHuffmanDecodingTables() exclusively.
      */
     private static void hbCreateDecodeTables(final int[] limit, final int[] 
base, final int[] perm, final char[] length, final int minLen, final int maxLen,
-            final int alphaSize) throws IOException {
+            final int alphaSize) {
         for (int i = minLen, pp = 0; i <= maxLen; i++) {
             for (int j = 0; j < alphaSize; j++) {
                 if (length[j] == i) {
@@ -171,7 +174,6 @@ private static void hbCreateDecodeTables(final int[] limit, 
final int[] base, fi
         }
         for (int i = 0; i < alphaSize; i++) {
             final int len = length[i] + 1;
-            checkBounds(len, MAX_ALPHA_SIZE, "length");
             base[len]++;
         }
         for (int i = 1, b = base[0]; i < MAX_CODE_LEN; i++) {
@@ -298,8 +300,7 @@ private boolean complete() throws IOException {
     /**
      * Called by recvDecodingTables() exclusively.
      */
-    private void createHuffmanDecodingTables(final int alphaSize, final int 
nGroups) throws IOException {
-        final Data dataShadow = this.data;
+    static void createHuffmanDecodingTables(final int alphaSize, final int 
nGroups, final Data dataShadow) throws IOException {
         final char[][] len = dataShadow.temp_charArray2d;
         final int[] minLens = dataShadow.minLens;
         final int[][] limit = dataShadow.limit;
@@ -307,8 +308,8 @@ private void createHuffmanDecodingTables(final int 
alphaSize, final int nGroups)
         final int[][] perm = dataShadow.perm;
 
         for (int t = 0; t < nGroups; t++) {
-            int minLen = 32;
-            int maxLen = 0;
+            int minLen = MAX_CODE_LEN;
+            int maxLen = 1;
             final char[] len_t = len[t];
             for (int i = alphaSize; --i >= 0;) {
                 final char lent = len_t[i];
@@ -751,12 +752,17 @@ private void recvDecodingTables() throws IOException {
                 while (bsGetBit(bin)) {
                     curr += bsGetBit(bin) ? -1 : 1;
                 }
+                // Same condition as in bzip2
+                if (curr < 1 || curr > MAX_CODE_LEN) {
+                    throw new CompressorException(
+                            "Corrupted input, code length value out of range 
[%d, %d]: %d", 1, MAX_CODE_LEN, curr);
+                }
                 len_t[i] = (char) curr;
             }
         }
 
         // finally create the Huffman tables
-        createHuffmanDecodingTables(alphaSize, nGroups);
+        createHuffmanDecodingTables(alphaSize, nGroups, dataShadow);
     }
 
     private int setupBlock() throws IOException {
diff --git 
a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2Constants.java
 
b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2Constants.java
index 9350d2100..592b0b527 100644
--- 
a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2Constants.java
+++ 
b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2Constants.java
@@ -35,9 +35,19 @@ interface BZip2Constants {
     int MAX_ALPHA_SIZE = 258;
 
     /**
-     * Constant {@value}.
+     * Maximum allowed length of a Huffman code in BZip2.
+     * <p>
+     *     In the original bzip2 C implementation, a defensive limit of 23 is 
set to prevent buffer overflows
+     *     (see <a 
href="https://github.com/libarchive/bzip2/blob/master/bzlib_private.h";>{@code 
bzip_private.h}</a>).
+     *     However, the actual enforced maximum during decompression is 20
+     *     (see <a 
href="https://github.com/libarchive/bzip2/blob/master/decompress.c";>BZ2_decompress
 in {@code decompress.c}</a>),
+     *     and since version 1.0.6, compression uses a maximum of 17.
+     * </p>
+     * <p>
+     *     The Java implementation does not require the higher defensive 
limit, so this constant is set to the true maximum of 20.
+     * </p>
      */
-    int MAX_CODE_LEN = 23;
+    int MAX_CODE_LEN = 20;
 
     /**
      * Constant {@value}.
diff --git 
a/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java
 
b/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java
index 778832668..387717009 100644
--- 
a/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java
+++ 
b/src/test/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStreamTest.java
@@ -18,6 +18,7 @@
  */
 package org.apache.commons.compress.compressors.bzip2;
 
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -35,6 +36,7 @@
 import org.apache.commons.compress.archivers.ArchiveInputStream;
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
 import org.apache.commons.compress.compressors.CompressorException;
+import 
org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream.Data;
 import org.apache.commons.io.IOUtils;
 import org.junit.jupiter.api.Test;
 
@@ -149,4 +151,20 @@ void testSingleByteReadConsistentlyReturnsMinusOneAtEof() 
throws IOException {
             assertEquals(-1, in.read());
         }
     }
+
+    @Test
+    void testCreateHuffmanDecodingTablesWithLargeAlphaSize() {
+        final Data data = new Data(1);
+        // Use a codeLengths array with length equal to MAX_ALPHA_SIZE (258) 
to test array bounds.
+        final char[] codeLengths = new char[258];
+        for (int i = 0; i < codeLengths.length; i++) {
+            // Use all code lengths within valid range [1, 20]
+            codeLengths[i] = (char) ((i % 20) + 1);
+        }
+        data.temp_charArray2d[0] = codeLengths;
+        assertDoesNotThrow(
+                () -> 
BZip2CompressorInputStream.createHuffmanDecodingTables(codeLengths.length, 1, 
data),
+                "createHuffmanDecodingTables should not throw for valid 
codeLengths array of MAX_ALPHA_SIZE");
+        assertEquals(data.minLens[0], 1, "Minimum code length should be 1");
+    }
 }

Reply via email to