(commons-compress) 01/01: ARJ: strict header validation and `selfExtracting` option

pkarwasz Fri, 17 Oct 2025 13:33:17 -0700

This is an automated email from the ASF dual-hosted git repository.

pkarwasz pushed a commit to branch feat/arj-strict-validation
in repository https://gitbox.apache.org/repos/asf/commons-compress.git


commit 83fe7fd90e5a41cc8c2be6095422404ba33ce4fa
Author: Piotr P. Karwasz <[email protected]>
AuthorDate: Mon Oct 6 18:00:57 2025 +0200

    ARJ: strict header validation and `selfExtracting` option
    
    Today `ArjArchiveInputStream` keeps scanning past invalid headers, assuming 
self-extracting stubs. That can hide corruption.
    
    This PR:
    
    * Introduces a `selfExtracting` ARJ archive option (default **false**).
    
      * **false:** no scanning; parse strictly from the first byte. Any 
invalid/truncated header fails fast.
      * **true:** scan only to locate the Main Archive Header (AMH), then 
switch to **strict mode**. All subsequent headers must be contiguous and valid.
    
    **Behavioral change**
    
    Previously, we might “skip over” bad data. Now we **only** allow a 
discovery scan for AMH (when opted in); everything after must validate or fail.
---
 src/changes/changes.xml                            |   1 +
 .../archivers/arj/ArjArchiveInputStream.java       | 179 ++++++++++++++-------
 .../archivers/arj/ArjArchiveInputStreamTest.java   |  78 +++++++--
 3 files changed, 184 insertions(+), 74 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 9e68ae23d..e19fb0b2f 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -81,6 +81,7 @@ The <action> type attribute can be add,update,fix,remove.
       <action type="fix" dev="pkarwasz" due-to="Tyler Nighswander">Simplify 
handling of special AR records in ArArchiveInputStream.</action>
       <!-- FIX arj -->
       <action type="fix" dev="pkarwasz" due-to="Piotr P. Karwasz">Correct byte 
accounting and truncation errors in ARJ input stream.</action>
+      <action type="fix" dev="pkarwasz" due-to="Piotr P. Karwasz">Add strict 
header validation in ARJ input stream and `selfExtracting` option.</action>
       <!-- FIX unpack200 -->      
       <action type="fix" dev="ggregory" due-to="Gary Gregory, Stanislav 
Fort">org.apache.commons.compress.harmony.unpack200 now throws 
Pack200Exception, IllegalArgumentException, and IllegalStateException instead 
of other runtime exceptions and Error.</action>
       <!-- FIX pack200 -->      
diff --git 
a/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java
 
b/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java
index 7068231be..6c76575dd 100644
--- 
a/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java
+++ 
b/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java
@@ -62,10 +62,34 @@ public class ArjArchiveInputStream extends 
ArchiveInputStream<ArjArchiveEntry> {
      */
     public static final class Builder extends 
AbstractArchiveBuilder<ArjArchiveInputStream, Builder> {
 
+        private boolean selfExtracting;
+
         private Builder() {
             setCharset(ENCODING_NAME);
         }
 
+        /**
+         * Enables compatibility with self-extracting (SFX) ARJ files.
+         *
+         * <p>When {@code true}, the stream is scanned forward to locate the 
first
+         * valid ARJ main header. All bytes before that point are ignored, 
which
+         * allows reading ARJ data embedded in an executable stub.</p>
+         *
+         * <p><strong>Caveat:</strong> this lenient pre-scan can mask 
corruption that
+         * would otherwise be reported at the start of a normal {@code .arj} 
file.
+         * Enable only when you expect an SFX input.</p>
+         *
+         * <p>Default: {@code false}.</p>
+         *
+         * @param selfExtracting {@code true} if the input stream is for a 
self-extracting archive
+         * @return {@code this} instance
+         * @since 1.29.0
+         */
+        public Builder setSelfExtracting(final boolean selfExtracting) {
+            this.selfExtracting = selfExtracting;
+            return asThis();
+        }
+
         @Override
         public ArjArchiveInputStream get() throws IOException {
             return new ArjArchiveInputStream(this);
@@ -75,6 +99,7 @@ public ArjArchiveInputStream get() throws IOException {
     private static final String ENCODING_NAME = "CP437";
     private static final int ARJ_MAGIC_1 = 0x60;
     private static final int ARJ_MAGIC_2 = 0xEA;
+    private static final int MIN_FIRST_HEADER_SIZE = 30;
 
     /**
      * Creates a new builder.
@@ -118,7 +143,7 @@ public ArjArchiveInputStream(final InputStream inputStream) 
throws ArchiveExcept
     private ArjArchiveInputStream(final InputStream inputStream, final Builder 
builder) throws ArchiveException {
         super(inputStream, builder);
         try {
-            mainHeader = readMainHeader();
+            mainHeader = readMainHeader(builder.selfExtracting);
             if ((mainHeader.arjFlags & MainHeader.Flags.GARBLED) != 0) {
                 throw new ArchiveException("Encrypted ARJ files are 
unsupported");
             }
@@ -231,49 +256,31 @@ public int read(final byte[] b, final int off, final int 
len) throws IOException
         return currentInputStream.read(b, off, len);
     }
 
-    private static int readUnsignedByte(InputStream in) throws IOException {
-        final int value = in.read();
-        if (value == -1) {
-            throw new EOFException();
-        }
-        return value & 0xff;
-    }
-
-    private int readSwappedUnsignedShort() throws IOException {
-        final int value = EndianUtils.readSwappedUnsignedShort(in);
-        count(2);
-        return value;
-    }
-
-    private int readUnsignedByte() throws IOException {
-        final int value = readUnsignedByte(in);
-        count(1);
-        return value & 0xff;
-    }
-
-    private static void readExtraData(final int firstHeaderSize, final 
InputStream firstHeader, final LocalFileHeader localFileHeader) throws 
IOException {
-        if (firstHeaderSize >= 33) {
-            localFileHeader.extendedFilePosition = 
EndianUtils.readSwappedInteger(firstHeader);
-            if (firstHeaderSize >= 45) {
-                localFileHeader.dateTimeAccessed = 
EndianUtils.readSwappedInteger(firstHeader);
-                localFileHeader.dateTimeCreated = 
EndianUtils.readSwappedInteger(firstHeader);
-                localFileHeader.originalSizeEvenForVolumes = 
EndianUtils.readSwappedInteger(firstHeader);
-            }
-        }
+    /**
+     * Verifies the CRC32 checksum of the given data against the next four 
bytes read from the input stream.
+     *
+     * @param data The data to verify.
+     * @return true if the checksum matches, false otherwise.
+     * @throws EOFException If the end of the stream is reached before reading 
the checksum.
+     * @throws IOException If an I/O error occurs.
+     */
+    @SuppressWarnings("Since15")
+    private boolean checkCRC32(final byte[] data) throws IOException {
+        final CRC32 crc32 = new CRC32();
+        crc32.update(data);
+        final long expectedCrc32 = readSwappedUnsignedInteger();
+        return crc32.getValue() == expectedCrc32;
     }
 
     /**
      * Scans for the next valid ARJ header.
      *
-     * @return The header bytes, or {@code null} if end of archive.
+     * @return The header bytes.
      * @throws EOFException If the end of the stream is reached before a valid 
header is found.
      * @throws IOException If an I/O error occurs.
      */
-    private byte[] readHeader() throws IOException {
+    private byte[] findMainHeader() throws IOException {
         byte[] basicHeaderBytes;
-        // TODO: Explain why we are scanning for a valid ARJ header
-        //       and don't throw, when an invalid/corrupted header is found,
-        //       which might indicate a corrupted archive.
         while (true) {
             int first;
             int second = readUnsignedByte();
@@ -282,23 +289,69 @@ private byte[] readHeader() throws IOException {
                 second = readUnsignedByte();
             } while (first != ARJ_MAGIC_1 && second != ARJ_MAGIC_2);
             final int basicHeaderSize = readSwappedUnsignedShort();
-            if (basicHeaderSize == 0) {
-                // end of archive
-                return null;
-            } else if (basicHeaderSize <= 2600) {
+            // At least two bytes are required for the null-terminated name 
and comment
+            // The value 2600 is taken from the reference implementation
+            if (MIN_FIRST_HEADER_SIZE + 2 <= basicHeaderSize && 
basicHeaderSize <= 2600) {
                 basicHeaderBytes = 
org.apache.commons.io.IOUtils.toByteArray(in, basicHeaderSize);
                 count(basicHeaderSize);
-                final long basicHeaderCrc32 = 
EndianUtils.readSwappedUnsignedInteger(in);
-                count(4);
-                final CRC32 crc32 = new CRC32();
-                crc32.update(basicHeaderBytes);
-                if (basicHeaderCrc32 == crc32.getValue()) {
+                if (checkCRC32(basicHeaderBytes)) {
                     return basicHeaderBytes;
                 }
             }
+            // CRC32 failed, continue scanning
         }
     }
 
+    private static int readUnsignedByte(InputStream in) throws IOException {
+        final int value = in.read();
+        if (value == -1) {
+            throw new EOFException();
+        }
+        return value & 0xff;
+    }
+
+    private int readSwappedUnsignedShort() throws IOException {
+        final int value = EndianUtils.readSwappedUnsignedShort(in);
+        count(2);
+        return value;
+    }
+
+    private long readSwappedUnsignedInteger() throws IOException {
+        final long value = EndianUtils.readSwappedUnsignedInteger(in);
+        count(4);
+        return value;
+    }
+
+    private int readUnsignedByte() throws IOException {
+        final int value = readUnsignedByte(in);
+        count(1);
+        return value & 0xff;
+    }
+
+    private byte[] readHeader() throws IOException {
+        final int first = readUnsignedByte();
+        final int second = readUnsignedByte();
+        if (first != ARJ_MAGIC_1 || second != ARJ_MAGIC_2) {
+            throw new ArchiveException("Corrupted ARJ archive: invalid ARJ 
header signature 0x%02X 0x%02X", first, second);
+        }
+        final int basicHeaderSize = readSwappedUnsignedShort();
+        if (basicHeaderSize == 0) {
+            // End of archive
+            return null;
+        }
+        // At least two bytes are required for the null-terminated name and 
comment
+        // The value 2600 is taken from the reference implementation
+        if (basicHeaderSize < MIN_FIRST_HEADER_SIZE + 2 || basicHeaderSize > 
2600) {
+            throw new ArchiveException("Corrupted ARJ archive: invalid ARJ 
header size %,d", basicHeaderSize);
+        }
+        final byte[] basicHeaderBytes = 
org.apache.commons.io.IOUtils.toByteArray(in, basicHeaderSize);
+        count(basicHeaderSize);
+        if (!checkCRC32(basicHeaderBytes)) {
+            throw new ArchiveException("Corrupted ARJ archive: invalid ARJ 
header CRC32 checksum");
+        }
+        return basicHeaderBytes;
+    }
+
     private LocalFileHeader readLocalFileHeader() throws IOException {
         final byte[] basicHeaderBytes = readHeader();
         if (basicHeaderBytes == null) {
@@ -325,8 +378,18 @@ private LocalFileHeader readLocalFileHeader() throws 
IOException {
                 localFileHeader.fileAccessMode = 
EndianUtils.readSwappedShort(firstHeader);
                 localFileHeader.firstChapter = readUnsignedByte(firstHeader);
                 localFileHeader.lastChapter = readUnsignedByte(firstHeader);
-
-                readExtraData(firstHeaderSize, firstHeader, localFileHeader);
+                // Total read (including size byte): 10 + 4 * 4 + 2 * 2 = 30 
bytes
+
+                if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4) {
+                    localFileHeader.extendedFilePosition = 
EndianUtils.readSwappedInteger(firstHeader);
+                    // Total read (including size byte): 30 + 4 = 34 bytes
+                    if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4 + 12) {
+                        localFileHeader.dateTimeAccessed = 
EndianUtils.readSwappedInteger(firstHeader);
+                        localFileHeader.dateTimeCreated = 
EndianUtils.readSwappedInteger(firstHeader);
+                        localFileHeader.originalSizeEvenForVolumes = 
EndianUtils.readSwappedInteger(firstHeader);
+                        // Total read (including size byte): 34 + 12 = 46 bytes
+                    }
+                }
             }
 
             localFileHeader.name = readString(basicHeader);
@@ -338,12 +401,8 @@ private LocalFileHeader readLocalFileHeader() throws 
IOException {
         while ((extendedHeaderSize = readSwappedUnsignedShort()) > 0) {
             final byte[] extendedHeaderBytes = 
org.apache.commons.io.IOUtils.toByteArray(in, extendedHeaderSize);
             count(extendedHeaderSize);
-            final long extendedHeaderCrc32 = 
EndianUtils.readSwappedUnsignedInteger(in);
-            count(4);
-            final CRC32 crc32 = new CRC32();
-            crc32.update(extendedHeaderBytes);
-            if (extendedHeaderCrc32 != crc32.getValue()) {
-                throw new ArchiveException("Extended header CRC32 verification 
failure");
+            if (!checkCRC32(extendedHeaderBytes)) {
+                throw new ArchiveException("Corrupted ARJ archive: extended 
header CRC32 verification failure");
             }
             extendedHeaders.add(extendedHeaderBytes);
         }
@@ -352,10 +411,10 @@ private LocalFileHeader readLocalFileHeader() throws 
IOException {
         return localFileHeader;
     }
 
-    private MainHeader readMainHeader() throws IOException {
+    private MainHeader readMainHeader(final boolean selfExtracting) throws 
IOException {
         final byte[] basicHeaderBytes;
         try {
-            basicHeaderBytes = readHeader();
+            basicHeaderBytes = selfExtracting ? findMainHeader() : 
readHeader();
         } catch (final EOFException e) {
             throw new ArchiveException("Archive ends without any headers", 
(Throwable) e);
         }
@@ -380,12 +439,14 @@ private MainHeader readMainHeader() throws IOException {
                 header.securityEnvelopeLength = 
EndianUtils.readSwappedShort(firstHeader);
                 header.encryptionVersion = readUnsignedByte(firstHeader);
                 header.lastChapter = readUnsignedByte(firstHeader);
+                // Total read (including size byte): 10 + 4 * 4 + 2 * 2 = 30 
bytes
 
-                if (firstHeaderSize >= 33) {
+                if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4) {
                     header.arjProtectionFactor = readUnsignedByte(firstHeader);
                     header.arjFlags2 = readUnsignedByte(firstHeader);
                     readUnsignedByte(firstHeader);
                     readUnsignedByte(firstHeader);
+                    // Total read (including size byte): 30 + 4 = 34 bytes
                 }
             }
 
@@ -397,12 +458,8 @@ private MainHeader readMainHeader() throws IOException {
         if (extendedHeaderSize > 0) {
             header.extendedHeaderBytes = 
org.apache.commons.io.IOUtils.toByteArray(in, extendedHeaderSize);
             count(extendedHeaderSize);
-            final long extendedHeaderCrc32 = 
EndianUtils.readSwappedUnsignedInteger(in);
-            count(4);
-            final CRC32 crc32 = new CRC32();
-            crc32.update(header.extendedHeaderBytes);
-            if (extendedHeaderCrc32 != crc32.getValue()) {
-                throw new ArchiveException("Extended header CRC32 verification 
failure");
+            if (!checkCRC32(header.extendedHeaderBytes)) {
+                throw new ArchiveException("Corrupted ARJ archive: extended 
header CRC32 verification failure");
             }
         }
 
diff --git 
a/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java
 
b/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java
index d8d5596ca..4da3595b6 100644
--- 
a/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java
+++ 
b/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java
@@ -24,27 +24,33 @@
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertInstanceOf;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertThrows;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assertions.fail;
 
+import java.io.ByteArrayInputStream;
 import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.SequenceInputStream;
 import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Calendar;
 import java.util.TimeZone;
+import java.util.stream.Stream;
+import java.util.zip.CRC32;
 
 import org.apache.commons.compress.AbstractTest;
 import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.io.EndianUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.BoundedInputStream;
 import org.apache.commons.io.output.ByteArrayOutputStream;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
 import org.junit.jupiter.params.provider.ValueSource;
+import org.junitpioneer.jupiter.cartesian.CartesianTest;
 
 /**
  * Tests {@link ArjArchiveInputStream}.
@@ -87,17 +93,34 @@ private void assertForEach(final ArjArchiveInputStream 
archive) throws IOExcepti
         });
     }
 
-    @Test
-    void testFirstHeaderSizeSetToZero() {
-        final ArchiveException ex = assertThrows(ArchiveException.class, () -> 
{
-            try (ArjArchiveInputStream archive = 
ArjArchiveInputStream.builder()
-                    
.setURI(getURI("org/apache/commons/compress/arj/zero_sized_headers-fail.arj"))
-                    .get()) {
-                // Do nothing, ArchiveException already thrown
-                fail("ArchiveException not thrown.");
-            }
-        });
-        assertTrue(ex.getCause() instanceof IOException);
+    private static byte[] createArjArchiveHeader(int size, boolean computeCrc) 
{
+        // Enough space for the fixed-size portion of the header plus:
+        // - signature (2 bytes)
+        // - the 2-byte basic header size field itself (2 bytes)
+        // - at least one byte each for the filename and comment C-strings (2 
bytes)
+        // - the 4-byte CRC-32 that follows the basic header
+        final byte[] bytes = new byte[4 + size + 10];
+        bytes[0] = (byte) 0x60; // ARJ signature
+        bytes[1] = (byte) 0xEA;
+        // Basic header size (little-endian)
+        EndianUtils.writeSwappedShort(bytes, 2, (short) (size + 2));
+        // First header size
+        bytes[4] = (byte) size;
+        // Compute valid CRC-32 for the basic header
+        if (computeCrc) {
+            final CRC32 crc32 = new CRC32();
+            crc32.update(bytes, 4, size + 2);
+            EndianUtils.writeSwappedInteger(bytes, 4 + size + 2, (int) 
crc32.getValue());
+        }
+        return bytes;
+    }
+
+    @CartesianTest
+    void testSmallFirstHeaderSize(
+            // 30 is the minimum valid size
+            @CartesianTest.Values(ints = {0, 1, 10, 29}) int size, 
@CartesianTest.Values(booleans = {false, true}) boolean selfExtracting) {
+        final byte[] bytes = createArjArchiveHeader(size, true);
+        assertThrows(ArchiveException.class, () -> 
ArjArchiveInputStream.builder().setByteArray(bytes).setSelfExtracting(selfExtracting).get());
     }
 
     @Test
@@ -277,6 +300,35 @@ void testReadingOfAttributesUnixVersion() throws Exception 
{
         }
     }
 
+    static Stream<byte[]> testSelfExtractingArchive() {
+        return Stream.of(
+                new byte[] { 0x10, 0x11, 0x12, 0x13, 0x14 },
+                // In a normal context: an archive trailer.
+                new byte[] { 0x60, (byte) 0xEA, 0x00, 0x00 },
+                // Header of valid size, but with an invalid CRC-32.
+                createArjArchiveHeader(30, false)
+        );
+    }
+
+    @ParameterizedTest
+    @MethodSource
+    void testSelfExtractingArchive(byte[] junk) throws Exception {
+        final Path validArj = getPath("bla.arj");
+        try (InputStream first = new ByteArrayInputStream(junk);
+             InputStream second = Files.newInputStream(validArj);
+             SequenceInputStream seq = new SequenceInputStream(first, second);
+             ArjArchiveInputStream in = 
ArjArchiveInputStream.builder().setInputStream(seq).setSelfExtracting(true).get())
 {
+            ArjArchiveEntry entry = in.getNextEntry();
+            assertNotNull(entry);
+            assertEquals("test1.xml", entry.getName());
+            entry = in.getNextEntry();
+            assertNotNull(entry);
+            assertEquals("test2.xml", entry.getName());
+            entry = in.getNextEntry();
+            assertNull(entry);
+        }
+    }
+
     @Test
     void testSingleArgumentConstructor() throws Exception {
         try (InputStream inputStream = 
Files.newInputStream(getPath("bla.arj"));

(commons-compress) 01/01: ARJ: strict header validation and `selfExtracting` option

Reply via email to