This is an automated email from the ASF dual-hosted git repository. pkarwasz pushed a commit to branch feat/arj-strict-validation in repository https://gitbox.apache.org/repos/asf/commons-compress.git
commit 83fe7fd90e5a41cc8c2be6095422404ba33ce4fa Author: Piotr P. Karwasz <[email protected]> AuthorDate: Mon Oct 6 18:00:57 2025 +0200 ARJ: strict header validation and `selfExtracting` option Today `ArjArchiveInputStream` keeps scanning past invalid headers, assuming self-extracting stubs. That can hide corruption. This PR: * Introduces a `selfExtracting` ARJ archive option (default **false**). * **false:** no scanning; parse strictly from the first byte. Any invalid/truncated header fails fast. * **true:** scan only to locate the Main Archive Header (AMH), then switch to **strict mode**. All subsequent headers must be contiguous and valid. **Behavioral change** Previously, we might “skip over” bad data. Now we **only** allow a discovery scan for AMH (when opted in); everything after must validate or fail. --- src/changes/changes.xml | 1 + .../archivers/arj/ArjArchiveInputStream.java | 179 ++++++++++++++------- .../archivers/arj/ArjArchiveInputStreamTest.java | 78 +++++++-- 3 files changed, 184 insertions(+), 74 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 9e68ae23d..e19fb0b2f 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -81,6 +81,7 @@ The <action> type attribute can be add,update,fix,remove. <action type="fix" dev="pkarwasz" due-to="Tyler Nighswander">Simplify handling of special AR records in ArArchiveInputStream.</action> <!-- FIX arj --> <action type="fix" dev="pkarwasz" due-to="Piotr P. Karwasz">Correct byte accounting and truncation errors in ARJ input stream.</action> + <action type="fix" dev="pkarwasz" due-to="Piotr P. Karwasz">Add strict header validation in ARJ input stream and `selfExtracting` option.</action> <!-- FIX unpack200 --> <action type="fix" dev="ggregory" due-to="Gary Gregory, Stanislav Fort">org.apache.commons.compress.harmony.unpack200 now throws Pack200Exception, IllegalArgumentException, and IllegalStateException instead of other runtime exceptions and Error.</action> <!-- FIX pack200 --> diff --git a/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java b/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java index 7068231be..6c76575dd 100644 --- a/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java +++ b/src/main/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStream.java @@ -62,10 +62,34 @@ public class ArjArchiveInputStream extends ArchiveInputStream<ArjArchiveEntry> { */ public static final class Builder extends AbstractArchiveBuilder<ArjArchiveInputStream, Builder> { + private boolean selfExtracting; + private Builder() { setCharset(ENCODING_NAME); } + /** + * Enables compatibility with self-extracting (SFX) ARJ files. + * + * <p>When {@code true}, the stream is scanned forward to locate the first + * valid ARJ main header. All bytes before that point are ignored, which + * allows reading ARJ data embedded in an executable stub.</p> + * + * <p><strong>Caveat:</strong> this lenient pre-scan can mask corruption that + * would otherwise be reported at the start of a normal {@code .arj} file. + * Enable only when you expect an SFX input.</p> + * + * <p>Default: {@code false}.</p> + * + * @param selfExtracting {@code true} if the input stream is for a self-extracting archive + * @return {@code this} instance + * @since 1.29.0 + */ + public Builder setSelfExtracting(final boolean selfExtracting) { + this.selfExtracting = selfExtracting; + return asThis(); + } + @Override public ArjArchiveInputStream get() throws IOException { return new ArjArchiveInputStream(this); @@ -75,6 +99,7 @@ public ArjArchiveInputStream get() throws IOException { private static final String ENCODING_NAME = "CP437"; private static final int ARJ_MAGIC_1 = 0x60; private static final int ARJ_MAGIC_2 = 0xEA; + private static final int MIN_FIRST_HEADER_SIZE = 30; /** * Creates a new builder. @@ -118,7 +143,7 @@ public ArjArchiveInputStream(final InputStream inputStream) throws ArchiveExcept private ArjArchiveInputStream(final InputStream inputStream, final Builder builder) throws ArchiveException { super(inputStream, builder); try { - mainHeader = readMainHeader(); + mainHeader = readMainHeader(builder.selfExtracting); if ((mainHeader.arjFlags & MainHeader.Flags.GARBLED) != 0) { throw new ArchiveException("Encrypted ARJ files are unsupported"); } @@ -231,49 +256,31 @@ public int read(final byte[] b, final int off, final int len) throws IOException return currentInputStream.read(b, off, len); } - private static int readUnsignedByte(InputStream in) throws IOException { - final int value = in.read(); - if (value == -1) { - throw new EOFException(); - } - return value & 0xff; - } - - private int readSwappedUnsignedShort() throws IOException { - final int value = EndianUtils.readSwappedUnsignedShort(in); - count(2); - return value; - } - - private int readUnsignedByte() throws IOException { - final int value = readUnsignedByte(in); - count(1); - return value & 0xff; - } - - private static void readExtraData(final int firstHeaderSize, final InputStream firstHeader, final LocalFileHeader localFileHeader) throws IOException { - if (firstHeaderSize >= 33) { - localFileHeader.extendedFilePosition = EndianUtils.readSwappedInteger(firstHeader); - if (firstHeaderSize >= 45) { - localFileHeader.dateTimeAccessed = EndianUtils.readSwappedInteger(firstHeader); - localFileHeader.dateTimeCreated = EndianUtils.readSwappedInteger(firstHeader); - localFileHeader.originalSizeEvenForVolumes = EndianUtils.readSwappedInteger(firstHeader); - } - } + /** + * Verifies the CRC32 checksum of the given data against the next four bytes read from the input stream. + * + * @param data The data to verify. + * @return true if the checksum matches, false otherwise. + * @throws EOFException If the end of the stream is reached before reading the checksum. + * @throws IOException If an I/O error occurs. + */ + @SuppressWarnings("Since15") + private boolean checkCRC32(final byte[] data) throws IOException { + final CRC32 crc32 = new CRC32(); + crc32.update(data); + final long expectedCrc32 = readSwappedUnsignedInteger(); + return crc32.getValue() == expectedCrc32; } /** * Scans for the next valid ARJ header. * - * @return The header bytes, or {@code null} if end of archive. + * @return The header bytes. * @throws EOFException If the end of the stream is reached before a valid header is found. * @throws IOException If an I/O error occurs. */ - private byte[] readHeader() throws IOException { + private byte[] findMainHeader() throws IOException { byte[] basicHeaderBytes; - // TODO: Explain why we are scanning for a valid ARJ header - // and don't throw, when an invalid/corrupted header is found, - // which might indicate a corrupted archive. while (true) { int first; int second = readUnsignedByte(); @@ -282,23 +289,69 @@ private byte[] readHeader() throws IOException { second = readUnsignedByte(); } while (first != ARJ_MAGIC_1 && second != ARJ_MAGIC_2); final int basicHeaderSize = readSwappedUnsignedShort(); - if (basicHeaderSize == 0) { - // end of archive - return null; - } else if (basicHeaderSize <= 2600) { + // At least two bytes are required for the null-terminated name and comment + // The value 2600 is taken from the reference implementation + if (MIN_FIRST_HEADER_SIZE + 2 <= basicHeaderSize && basicHeaderSize <= 2600) { basicHeaderBytes = org.apache.commons.io.IOUtils.toByteArray(in, basicHeaderSize); count(basicHeaderSize); - final long basicHeaderCrc32 = EndianUtils.readSwappedUnsignedInteger(in); - count(4); - final CRC32 crc32 = new CRC32(); - crc32.update(basicHeaderBytes); - if (basicHeaderCrc32 == crc32.getValue()) { + if (checkCRC32(basicHeaderBytes)) { return basicHeaderBytes; } } + // CRC32 failed, continue scanning } } + private static int readUnsignedByte(InputStream in) throws IOException { + final int value = in.read(); + if (value == -1) { + throw new EOFException(); + } + return value & 0xff; + } + + private int readSwappedUnsignedShort() throws IOException { + final int value = EndianUtils.readSwappedUnsignedShort(in); + count(2); + return value; + } + + private long readSwappedUnsignedInteger() throws IOException { + final long value = EndianUtils.readSwappedUnsignedInteger(in); + count(4); + return value; + } + + private int readUnsignedByte() throws IOException { + final int value = readUnsignedByte(in); + count(1); + return value & 0xff; + } + + private byte[] readHeader() throws IOException { + final int first = readUnsignedByte(); + final int second = readUnsignedByte(); + if (first != ARJ_MAGIC_1 || second != ARJ_MAGIC_2) { + throw new ArchiveException("Corrupted ARJ archive: invalid ARJ header signature 0x%02X 0x%02X", first, second); + } + final int basicHeaderSize = readSwappedUnsignedShort(); + if (basicHeaderSize == 0) { + // End of archive + return null; + } + // At least two bytes are required for the null-terminated name and comment + // The value 2600 is taken from the reference implementation + if (basicHeaderSize < MIN_FIRST_HEADER_SIZE + 2 || basicHeaderSize > 2600) { + throw new ArchiveException("Corrupted ARJ archive: invalid ARJ header size %,d", basicHeaderSize); + } + final byte[] basicHeaderBytes = org.apache.commons.io.IOUtils.toByteArray(in, basicHeaderSize); + count(basicHeaderSize); + if (!checkCRC32(basicHeaderBytes)) { + throw new ArchiveException("Corrupted ARJ archive: invalid ARJ header CRC32 checksum"); + } + return basicHeaderBytes; + } + private LocalFileHeader readLocalFileHeader() throws IOException { final byte[] basicHeaderBytes = readHeader(); if (basicHeaderBytes == null) { @@ -325,8 +378,18 @@ private LocalFileHeader readLocalFileHeader() throws IOException { localFileHeader.fileAccessMode = EndianUtils.readSwappedShort(firstHeader); localFileHeader.firstChapter = readUnsignedByte(firstHeader); localFileHeader.lastChapter = readUnsignedByte(firstHeader); - - readExtraData(firstHeaderSize, firstHeader, localFileHeader); + // Total read (including size byte): 10 + 4 * 4 + 2 * 2 = 30 bytes + + if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4) { + localFileHeader.extendedFilePosition = EndianUtils.readSwappedInteger(firstHeader); + // Total read (including size byte): 30 + 4 = 34 bytes + if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4 + 12) { + localFileHeader.dateTimeAccessed = EndianUtils.readSwappedInteger(firstHeader); + localFileHeader.dateTimeCreated = EndianUtils.readSwappedInteger(firstHeader); + localFileHeader.originalSizeEvenForVolumes = EndianUtils.readSwappedInteger(firstHeader); + // Total read (including size byte): 34 + 12 = 46 bytes + } + } } localFileHeader.name = readString(basicHeader); @@ -338,12 +401,8 @@ private LocalFileHeader readLocalFileHeader() throws IOException { while ((extendedHeaderSize = readSwappedUnsignedShort()) > 0) { final byte[] extendedHeaderBytes = org.apache.commons.io.IOUtils.toByteArray(in, extendedHeaderSize); count(extendedHeaderSize); - final long extendedHeaderCrc32 = EndianUtils.readSwappedUnsignedInteger(in); - count(4); - final CRC32 crc32 = new CRC32(); - crc32.update(extendedHeaderBytes); - if (extendedHeaderCrc32 != crc32.getValue()) { - throw new ArchiveException("Extended header CRC32 verification failure"); + if (!checkCRC32(extendedHeaderBytes)) { + throw new ArchiveException("Corrupted ARJ archive: extended header CRC32 verification failure"); } extendedHeaders.add(extendedHeaderBytes); } @@ -352,10 +411,10 @@ private LocalFileHeader readLocalFileHeader() throws IOException { return localFileHeader; } - private MainHeader readMainHeader() throws IOException { + private MainHeader readMainHeader(final boolean selfExtracting) throws IOException { final byte[] basicHeaderBytes; try { - basicHeaderBytes = readHeader(); + basicHeaderBytes = selfExtracting ? findMainHeader() : readHeader(); } catch (final EOFException e) { throw new ArchiveException("Archive ends without any headers", (Throwable) e); } @@ -380,12 +439,14 @@ private MainHeader readMainHeader() throws IOException { header.securityEnvelopeLength = EndianUtils.readSwappedShort(firstHeader); header.encryptionVersion = readUnsignedByte(firstHeader); header.lastChapter = readUnsignedByte(firstHeader); + // Total read (including size byte): 10 + 4 * 4 + 2 * 2 = 30 bytes - if (firstHeaderSize >= 33) { + if (firstHeaderSize >= MIN_FIRST_HEADER_SIZE + 4) { header.arjProtectionFactor = readUnsignedByte(firstHeader); header.arjFlags2 = readUnsignedByte(firstHeader); readUnsignedByte(firstHeader); readUnsignedByte(firstHeader); + // Total read (including size byte): 30 + 4 = 34 bytes } } @@ -397,12 +458,8 @@ private MainHeader readMainHeader() throws IOException { if (extendedHeaderSize > 0) { header.extendedHeaderBytes = org.apache.commons.io.IOUtils.toByteArray(in, extendedHeaderSize); count(extendedHeaderSize); - final long extendedHeaderCrc32 = EndianUtils.readSwappedUnsignedInteger(in); - count(4); - final CRC32 crc32 = new CRC32(); - crc32.update(header.extendedHeaderBytes); - if (extendedHeaderCrc32 != crc32.getValue()) { - throw new ArchiveException("Extended header CRC32 verification failure"); + if (!checkCRC32(header.extendedHeaderBytes)) { + throw new ArchiveException("Corrupted ARJ archive: extended header CRC32 verification failure"); } } diff --git a/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java b/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java index d8d5596ca..4da3595b6 100644 --- a/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java +++ b/src/test/java/org/apache/commons/compress/archivers/arj/ArjArchiveInputStreamTest.java @@ -24,27 +24,33 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; +import java.io.ByteArrayInputStream; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; +import java.io.SequenceInputStream; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; import java.util.Calendar; import java.util.TimeZone; +import java.util.stream.Stream; +import java.util.zip.CRC32; import org.apache.commons.compress.AbstractTest; import org.apache.commons.compress.archivers.ArchiveException; +import org.apache.commons.io.EndianUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.BoundedInputStream; import org.apache.commons.io.output.ByteArrayOutputStream; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.ValueSource; +import org.junitpioneer.jupiter.cartesian.CartesianTest; /** * Tests {@link ArjArchiveInputStream}. @@ -87,17 +93,34 @@ private void assertForEach(final ArjArchiveInputStream archive) throws IOExcepti }); } - @Test - void testFirstHeaderSizeSetToZero() { - final ArchiveException ex = assertThrows(ArchiveException.class, () -> { - try (ArjArchiveInputStream archive = ArjArchiveInputStream.builder() - .setURI(getURI("org/apache/commons/compress/arj/zero_sized_headers-fail.arj")) - .get()) { - // Do nothing, ArchiveException already thrown - fail("ArchiveException not thrown."); - } - }); - assertTrue(ex.getCause() instanceof IOException); + private static byte[] createArjArchiveHeader(int size, boolean computeCrc) { + // Enough space for the fixed-size portion of the header plus: + // - signature (2 bytes) + // - the 2-byte basic header size field itself (2 bytes) + // - at least one byte each for the filename and comment C-strings (2 bytes) + // - the 4-byte CRC-32 that follows the basic header + final byte[] bytes = new byte[4 + size + 10]; + bytes[0] = (byte) 0x60; // ARJ signature + bytes[1] = (byte) 0xEA; + // Basic header size (little-endian) + EndianUtils.writeSwappedShort(bytes, 2, (short) (size + 2)); + // First header size + bytes[4] = (byte) size; + // Compute valid CRC-32 for the basic header + if (computeCrc) { + final CRC32 crc32 = new CRC32(); + crc32.update(bytes, 4, size + 2); + EndianUtils.writeSwappedInteger(bytes, 4 + size + 2, (int) crc32.getValue()); + } + return bytes; + } + + @CartesianTest + void testSmallFirstHeaderSize( + // 30 is the minimum valid size + @CartesianTest.Values(ints = {0, 1, 10, 29}) int size, @CartesianTest.Values(booleans = {false, true}) boolean selfExtracting) { + final byte[] bytes = createArjArchiveHeader(size, true); + assertThrows(ArchiveException.class, () -> ArjArchiveInputStream.builder().setByteArray(bytes).setSelfExtracting(selfExtracting).get()); } @Test @@ -277,6 +300,35 @@ void testReadingOfAttributesUnixVersion() throws Exception { } } + static Stream<byte[]> testSelfExtractingArchive() { + return Stream.of( + new byte[] { 0x10, 0x11, 0x12, 0x13, 0x14 }, + // In a normal context: an archive trailer. + new byte[] { 0x60, (byte) 0xEA, 0x00, 0x00 }, + // Header of valid size, but with an invalid CRC-32. + createArjArchiveHeader(30, false) + ); + } + + @ParameterizedTest + @MethodSource + void testSelfExtractingArchive(byte[] junk) throws Exception { + final Path validArj = getPath("bla.arj"); + try (InputStream first = new ByteArrayInputStream(junk); + InputStream second = Files.newInputStream(validArj); + SequenceInputStream seq = new SequenceInputStream(first, second); + ArjArchiveInputStream in = ArjArchiveInputStream.builder().setInputStream(seq).setSelfExtracting(true).get()) { + ArjArchiveEntry entry = in.getNextEntry(); + assertNotNull(entry); + assertEquals("test1.xml", entry.getName()); + entry = in.getNextEntry(); + assertNotNull(entry); + assertEquals("test2.xml", entry.getName()); + entry = in.getNextEntry(); + assertNull(entry); + } + } + @Test void testSingleArgumentConstructor() throws Exception { try (InputStream inputStream = Files.newInputStream(getPath("bla.arj"));
