This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 75eea6e5502f4f5a2edf5ab459b4c369d33f66e5 Author: tballison <[email protected]> AuthorDate: Wed Apr 19 21:18:56 2017 -0400 TIKA-2330 -- prevent preventable ooms in both detecting and parsing corrupt files or files that are misidentified as compressed streams. --- .../tika/exception/TikaMemoryLimitException.java | 30 ++ tika-parent/pom.xml | 2 +- tika-parsers/pom.xml | 2 +- .../apache/tika/parser/pkg/CompressorParser.java | 48 +- .../parser/pkg/TikaCompressorStreamFactory.java | 551 +++++++++++++++++++++ .../tika/parser/pkg/ZipContainerDetector.java | 16 +- .../tika/detect/TestContainerAwareDetector.java | 10 + .../apache/tika/parser/pkg/CompressParserTest.java | 22 + .../src/test/resources/test-documents/testLZMA_oom | Bin 0 -> 19 bytes .../src/test/resources/test-documents/testZ_oom.Z | 1 + 10 files changed, 663 insertions(+), 19 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java new file mode 100644 index 0000000..baf5818 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.exception; + +/* + * Thrown when a parser is asked to allocate more memory than is allowable + * for a given threshold. For example, the ZCompressorInputStream might + * be asked to create an array many gigabytes of length by a corrupt file. + */ +public class TikaMemoryLimitException extends TikaException { + + public TikaMemoryLimitException(String msg) { + super(msg); + } +} diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 9220c3d..5e4b0dc 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -306,7 +306,7 @@ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding> <!-- NOTE: sync tukaani version with commons-compress in tika-parsers --> - <commons.compress.version>1.12</commons.compress.version> + <commons.compress.version>1.13</commons.compress.version> <commons.io.version>2.5</commons.io.version> <slf4j.version>1.7.24</slf4j.version> </properties> diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 58ac745..7ff88c2 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -39,7 +39,7 @@ <!-- NOTE: sync codec version with POI --> <codec.version>1.10</codec.version> <!-- NOTE: sync tukaani version with commons-compress in tika-parent--> - <tukaani.version>1.5</tukaani.version> + <tukaani.version>1.6</tukaani.version> <mime4j.version>0.7.2</mime4j.version> <vorbis.version>0.8</vorbis.version> <pdfbox.version>2.0.5</pdfbox.version> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index f82db54..ff589e0 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -25,18 +25,20 @@ import java.util.Set; import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.compressors.CompressorInputStream; -import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipUtils; +import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream; import org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream; import org.apache.commons.compress.compressors.snappy.FramedSnappyCompressorInputStream; import org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream; import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; import org.apache.commons.compress.compressors.z.ZCompressorInputStream; import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.TikaMemoryLimitException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.metadata.Metadata; @@ -64,13 +66,16 @@ public class CompressorParser extends AbstractParser { private static final MediaType PACK = MediaType.application("x-java-pack200"); private static final MediaType SNAPPY = MediaType.application("x-snappy-framed"); private static final MediaType ZLIB = MediaType.application("zlib"); + private static final MediaType LZMA = MediaType.application("x-lzma"); private static final Set<MediaType> SUPPORTED_TYPES = - MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, COMPRESS, XZ, PACK, ZLIB); + MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, COMPRESS, XZ, PACK, ZLIB, LZMA); + + private int memoryLimitInKb = 100000;//100MB static MediaType getMediaType(CompressorInputStream stream) { // TODO Add support for the remaining CompressorInputStream formats: - // LZMACompressorInputStream + // LZ4 // LZWInputStream -> UnshrinkingInputStream if (stream instanceof BZip2CompressorInputStream) { return BZIP2; @@ -88,6 +93,31 @@ public class CompressorParser extends AbstractParser { stream instanceof SnappyCompressorInputStream) { // TODO Add unit tests for this format return SNAPPY; + } else if (stream instanceof LZMACompressorInputStream) { + return LZMA; + } else { + return MediaType.OCTET_STREAM; + } + } + + static MediaType getMediaType(String name) { + if (TikaCompressorStreamFactory.BZIP2.equals(name)) { + return BZIP2; + } else if (TikaCompressorStreamFactory.GZIP.equals(name)) { + return GZIP; + } else if (TikaCompressorStreamFactory.XZ.equals(name)) { + return XZ; + } else if (TikaCompressorStreamFactory.DEFLATE.equals(name)) { + return ZLIB; + } else if (TikaCompressorStreamFactory.Z.equals(name)) { + return COMPRESS; + } else if (TikaCompressorStreamFactory.PACK200.equals(name)) { + return PACK; + } else if (TikaCompressorStreamFactory.SNAPPY_FRAMED.equals(name) || + TikaCompressorStreamFactory.SNAPPY_RAW.equals(name)) { + return SNAPPY; + } else if (TikaCompressorStreamFactory.LZMA.equals(name)) { + return LZMA; } else { return MediaType.OCTET_STREAM; } @@ -119,10 +149,13 @@ public class CompressorParser extends AbstractParser { return false; } }); - CompressorStreamFactory factory = - new CompressorStreamFactory(options.decompressConcatenated(metadata)); + TikaCompressorStreamFactory factory = + new TikaCompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb); cis = factory.createCompressorInputStream(stream); } catch (CompressorException e) { + if (e.getMessage() != null && e.getMessage().startsWith("MemoryLimitException:")) { + throw new TikaMemoryLimitException(e.getMessage()); + } throw new TikaException("Unable to uncompress document stream", e); } @@ -171,4 +204,9 @@ public class CompressorParser extends AbstractParser { xhtml.endDocument(); } + @Field + public void setMemoryLimitInKb(int memoryLimitInKb) { + this.memoryLimitInKb = memoryLimitInKb; + } + } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaCompressorStreamFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaCompressorStreamFactory.java new file mode 100644 index 0000000..a1a8405 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaCompressorStreamFactory.java @@ -0,0 +1,551 @@ +package org.apache.tika.parser.pkg; + /* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.Locale; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorOutputStream; +import org.apache.commons.compress.compressors.CompressorStreamProvider; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.commons.compress.compressors.lzma.LZMAUtils; +import org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream; +import org.apache.commons.compress.compressors.snappy.FramedSnappyCompressorInputStream; +import org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream; +import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; +import org.apache.commons.compress.compressors.xz.XZUtils; +import org.apache.commons.compress.compressors.z.ZCompressorInputStream; +import org.apache.commons.compress.utils.IOUtils; +import org.apache.commons.compress.utils.Lists; +import org.apache.commons.compress.utils.ServiceLoaderIterator; +import org.apache.commons.compress.utils.Sets; +import org.apache.tika.exception.TikaMemoryLimitException; +import org.tukaani.xz.LZMAInputStream; +import org.tukaani.xz.MemoryLimitException; + +/** + * This is a temporary copy/paste hack from commons-compress for Tika 1.15 + * that 1) allows detection without initialization of a stream and + * 2) prevents easily preventable OOM on two file formats. + * + * Once commons-compress 1.14 is released, we will delete this class + * and go back to commons-compress's CompressorStreamFactory. + */ +@Deprecated +class TikaCompressorStreamFactory implements CompressorStreamProvider { + + + + private static final TikaCompressorStreamFactory SINGLETON = new TikaCompressorStreamFactory(true, -1); + + /** + * Constant (value {@value}) used to identify the BZIP2 compression + * algorithm. + * + * @since 1.1 + */ + public static final String BZIP2 = "bzip2"; + + /** + * Constant (value {@value}) used to identify the GZIP compression + * algorithm. + * + * @since 1.1 + */ + public static final String GZIP = "gz"; + + /** + * Constant (value {@value}) used to identify the PACK200 compression + * algorithm. + * + * @since 1.3 + */ + public static final String PACK200 = "pack200"; + + /** + * Constant (value {@value}) used to identify the XZ compression method. + * + * @since 1.4 + */ + public static final String XZ = "xz"; + + /** + * Constant (value {@value}) used to identify the LZMA compression method. + * + * @since 1.6 + */ + public static final String LZMA = "lzma"; + + /** + * Constant (value {@value}) used to identify the "framed" Snappy + * compression method. + * + * @since 1.7 + */ + public static final String SNAPPY_FRAMED = "snappy-framed"; + + /** + * Constant (value {@value}) used to identify the "raw" Snappy compression + * method. Not supported as an output stream type. + * + * @since 1.7 + */ + public static final String SNAPPY_RAW = "snappy-raw"; + + /** + * Constant (value {@value}) used to identify the traditional Unix compress + * method. Not supported as an output stream type. + * + * @since 1.7 + */ + public static final String Z = "z"; + + /** + * Constant (value {@value}) used to identify the Deflate compress method. + * + * @since 1.9 + */ + public static final String DEFLATE = "deflate"; + + + private final int memoryLimitInKb; + + private SortedMap<String, CompressorStreamProvider> compressorInputStreamProviders; + + + public static String getBzip2() { + return BZIP2; + } + + public static String getDeflate() { + return DEFLATE; + } + + public static String getGzip() { + return GZIP; + } + + public static String getLzma() { + return LZMA; + } + + public static String getPack200() { + return PACK200; + } + + public static TikaCompressorStreamFactory getSingleton() { + return SINGLETON; + } + + public static String getSnappyFramed() { + return SNAPPY_FRAMED; + } + + public static String getSnappyRaw() { + return SNAPPY_RAW; + } + + public static String getXz() { + return XZ; + } + + public static String getZ() { + return Z; + } + + static void putAll(final Set<String> names, final CompressorStreamProvider provider, + final TreeMap<String, CompressorStreamProvider> map) { + for (final String name : names) { + map.put(toKey(name), provider); + } + } + + private static String toKey(final String name) { + return name.toUpperCase(Locale.ROOT); + } + + /** + * If true, decompress until the end of the input. If false, stop after the + * first stream and leave the input position to point to the next byte after + * the stream + */ + private final Boolean decompressUntilEOF; + + /** + * If true, decompress until the end of the input. If false, stop after the + * first stream and leave the input position to point to the next byte after + * the stream + */ + private volatile boolean decompressConcatenated = false; + + /** + * Create an instance with the provided decompress Concatenated option. + * + * @param decompressUntilEOF + * if true, decompress until the end of the input; if false, stop + * after the first stream and leave the input position to point + * to the next byte after the stream. This setting applies to the + * gzip, bzip2 and xz formats only. + * @since 1.10 + */ + public TikaCompressorStreamFactory(final boolean decompressUntilEOF, final int memoryLimitInKb) { + this.decompressUntilEOF = Boolean.valueOf(decompressUntilEOF); + // Also copy to existing variable so can continue to use that as the + // current value + this.decompressConcatenated = decompressUntilEOF; + this.memoryLimitInKb = memoryLimitInKb; + } + + /** + * Try to detect the type of compressor stream. + * + * @param in input stream + * @return type of compressor stream detected + * @throws CompressorException if no compressor stream type was detected + * or if something else went wrong + * @throws IllegalArgumentException if stream is null or does not support mark + * + * @since 1.14 + */ + public static String detect(final InputStream in) throws CompressorException { + if (in == null) { + throw new IllegalArgumentException("Stream must not be null."); + } + + if (!in.markSupported()) { + throw new IllegalArgumentException("Mark is not supported."); + } + + final byte[] signature = new byte[12]; + in.mark(signature.length); + int signatureLength = -1; + try { + signatureLength = IOUtils.readFully(in, signature); + in.reset(); + } catch (IOException e) { + throw new CompressorException("IOException while reading signature.", e); + } + + if (BZip2CompressorInputStream.matches(signature, signatureLength)) { + return BZIP2; + } + + if (GzipCompressorInputStream.matches(signature, signatureLength)) { + return GZIP; + } + + if (Pack200CompressorInputStream.matches(signature, signatureLength)) { + return PACK200; + } + + if (FramedSnappyCompressorInputStream.matches(signature, signatureLength)) { + return SNAPPY_FRAMED; + } + + if (ZCompressorInputStream.matches(signature, signatureLength)) { + return Z; + } + + if (DeflateCompressorInputStream.matches(signature, signatureLength)) { + return DEFLATE; + } + + if (XZUtils.matches(signature, signatureLength)) { + return XZ; + } + + if (LZMAUtils.matches(signature, signatureLength)) { + return LZMA; + } + +/* if (FramedLZ4CompressorInputStream.matches(signature, signatureLength)) { + return LZ4_FRAMED; + }*/ + + throw new CompressorException("No Compressor found for the stream signature."); + } + + public SortedMap<String, CompressorStreamProvider> getCompressorInputStreamProviders() { + if (compressorInputStreamProviders == null) { + compressorInputStreamProviders = Collections + .unmodifiableSortedMap(findAvailableCompressorInputStreamProviders()); + } + return compressorInputStreamProviders; + } + + public static SortedMap<String, CompressorStreamProvider> findAvailableCompressorInputStreamProviders() { + return AccessController.doPrivileged(new PrivilegedAction<SortedMap<String, CompressorStreamProvider>>() { + @Override + public SortedMap<String, CompressorStreamProvider> run() { + final TreeMap<String, CompressorStreamProvider> map = new TreeMap<>(); + putAll(SINGLETON.getInputStreamCompressorNames(), SINGLETON, map); + for (final CompressorStreamProvider provider : findCompressorStreamProviders()) { + putAll(provider.getInputStreamCompressorNames(), provider, map); + } + return map; + } + }); + } + + private static ArrayList<CompressorStreamProvider> findCompressorStreamProviders() { + return Lists.newArrayList(serviceLoaderIterator()); + } + + private static Iterator<CompressorStreamProvider> serviceLoaderIterator() { + return new ServiceLoaderIterator<>(CompressorStreamProvider.class); + } + + /** + * Create an compressor input stream from an input stream, autodetecting the + * compressor type from the first few bytes of the stream. The InputStream + * must support marks, like BufferedInputStream. + * + * @param in + * the input stream + * @return the compressor input stream + * @throws CompressorException + * if the compressor name is not known + * @throws IllegalArgumentException + * if the stream is null or does not support mark + * @since 1.1 + */ + public CompressorInputStream createCompressorInputStream(final InputStream in) throws CompressorException, + TikaMemoryLimitException { + return createCompressorInputStream(detect(in), in); + } + + /** + * Creates a compressor input stream from a compressor name and an input + * stream. + * + * @param name + * of the compressor, i.e. {@value #GZIP}, {@value #BZIP2}, + * {@value #XZ}, {@value #LZMA}, {@value #PACK200}, + * {@value #SNAPPY_RAW}, {@value #SNAPPY_FRAMED}, {@value #Z}, + * or {@value #DEFLATE} + * @param in + * the input stream + * @return compressor input stream + * @throws CompressorException + * if the compressor name is not known or not available + * @throws IllegalArgumentException + * if the name or input stream is null + */ + public CompressorInputStream createCompressorInputStream(final String name, final InputStream in) + throws CompressorException, TikaMemoryLimitException { + return createCompressorInputStream(name, in, decompressConcatenated); + } + + public CompressorInputStream createCompressorInputStream(final String name, final InputStream in, + final boolean actualDecompressConcatenated) throws CompressorException { + if (name == null || in == null) { + throw new IllegalArgumentException("Compressor name and stream must not be null."); + } + + try { + + if (GZIP.equalsIgnoreCase(name)) { + return new GzipCompressorInputStream(in, actualDecompressConcatenated); + } + + if (BZIP2.equalsIgnoreCase(name)) { + return new BZip2CompressorInputStream(in, actualDecompressConcatenated); + } + + if (XZ.equalsIgnoreCase(name)) { + if (!XZUtils.isXZCompressionAvailable()) { + throw new CompressorException("XZ compression is not available."); + } + return new XZCompressorInputStream(in, actualDecompressConcatenated); + } + + if (LZMA.equalsIgnoreCase(name)) { + if (!LZMAUtils.isLZMACompressionAvailable()) { + throw new CompressorException("LZMA compression is not available"); + } + try { + return new SaferLZMACompressorInputStream(in); + } catch (MemoryLimitException e) { + throw new CompressorException("MemoryLimitException: " + e.getMessage(), e); + } + } + + if (PACK200.equalsIgnoreCase(name)) { + return new Pack200CompressorInputStream(in); + } + + if (SNAPPY_RAW.equalsIgnoreCase(name)) { + return new SnappyCompressorInputStream(in); + } + + if (SNAPPY_FRAMED.equalsIgnoreCase(name)) { + return new FramedSnappyCompressorInputStream(in); + } + + if (Z.equalsIgnoreCase(name)) { + try { + return new SaferZCompressorInputStream(in); + } catch (TikaRuntimeMemoryLimitException e) { + throw new CompressorException("MemoryLimitException: " + e.getMessage(), e); + } + } + + if (DEFLATE.equalsIgnoreCase(name)) { + return new DeflateCompressorInputStream(in); + } +/* +not currently supported + if (LZ4_BLOCK.equalsIgnoreCase(name)) { + return new BlockLZ4CompressorInputStream(in); + } + + if (LZ4_FRAMED.equalsIgnoreCase(name)) { + return new FramedLZ4CompressorInputStream(in, actualDecompressConcatenated); + } + */ + + } catch (final IOException e) { + throw new CompressorException("Could not create CompressorInputStream.", e); + } + + final CompressorStreamProvider compressorStreamProvider = getCompressorInputStreamProviders().get(toKey(name)); + if (compressorStreamProvider != null) { + return compressorStreamProvider.createCompressorInputStream(name, in, actualDecompressConcatenated); + } + + throw new CompressorException("Compressor: " + name + " not found."); + } + + @Override + public CompressorOutputStream createCompressorOutputStream(String s, OutputStream outputStream) throws CompressorException { + throw new UnsupportedOperationException(); + } + + + // For Unit tests + boolean getDecompressConcatenated() { + return decompressConcatenated; + } + + public Set<String> getInputStreamCompressorNames() { + return Sets.newHashSet(GZIP, BZIP2, XZ, LZMA, PACK200, DEFLATE, SNAPPY_RAW, SNAPPY_FRAMED, Z); + } + + @Override + public Set<String> getOutputStreamCompressorNames() { + throw new UnsupportedOperationException(); + } + + public Boolean getDecompressUntilEOF() { + return decompressUntilEOF; + } + + private class SaferZCompressorInputStream extends ZCompressorInputStream { + + public SaferZCompressorInputStream(InputStream inputStream) throws IOException { + super(inputStream); + } + + @Override + protected void initializeTables(int maxCodeSize) { + int maxTableSize = 1 << maxCodeSize; + if (memoryLimitInKb > -1 && maxTableSize > (memoryLimitInKb*1024)) { + throw new TikaRuntimeMemoryLimitException("Calculated maxCodeSize ("+maxCodeSize+" bytes) is greater "+ + "than the maximum allowable ("+ (memoryLimitInKb*1024) +" bytes).\n"+ + "If the file is not corrupt, consider increasing " + + "the memoryLimitInKb parameter in the CompressorParser"); + } + super.initializeTables(maxCodeSize); + } + } + + private static class TikaRuntimeMemoryLimitException extends RuntimeException { + public TikaRuntimeMemoryLimitException(String msg) { + super(msg); + } + } + + private class SaferLZMACompressorInputStream extends CompressorInputStream { + private final InputStream in; + + /** + * Creates a new input stream that decompresses LZMA-compressed data + * from the specified input stream. + * + * @param inputStream where to read the compressed data + * + * @throws IOException if the input is not in the .lzma format, + * the input is corrupt or truncated, the .lzma + * headers specify sizes that are not supported + * by this implementation, or the underlying + * <code>inputStream</code> throws an exception + */ + public SaferLZMACompressorInputStream(final InputStream inputStream) throws IOException { + in = new LZMAInputStream(inputStream, memoryLimitInKb); + } + + /** {@inheritDoc} */ + @Override + public int read() throws IOException { + final int ret = in.read(); + count(ret == -1 ? 0 : 1); + return ret; + } + + /** {@inheritDoc} */ + @Override + public int read(final byte[] buf, final int off, final int len) throws IOException { + final int ret = in.read(buf, off, len); + count(ret); + return ret; + } + + /** {@inheritDoc} */ + @Override + public long skip(final long n) throws IOException { + return in.skip(n); + } + + /** {@inheritDoc} */ + @Override + public int available() throws IOException { + return in.available(); + } + + /** {@inheritDoc} */ + @Override + public void close() throws IOException { + in.close(); + } + } +} diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java index d43a17c..2434d1a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java @@ -16,6 +16,8 @@ */ package org.apache.tika.parser.pkg; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; @@ -33,8 +35,6 @@ import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.compressors.CompressorException; -import org.apache.commons.compress.compressors.CompressorInputStream; -import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.commons.io.IOUtils; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.opc.OPCPackage; @@ -52,8 +52,6 @@ import org.apache.tika.parser.iwork.IWorkPackageParser; import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType; import org.apache.tika.parser.iwork.iwana.IWork13PackageParser; -import static java.nio.charset.StandardCharsets.UTF_8; - /** * A detector that works on Zip documents and other archive and compression * formats to figure out exactly what the file is. @@ -105,14 +103,8 @@ public class ZipContainerDetector implements Detector { private static MediaType detectCompressorFormat(byte[] prefix, int length) { try { - CompressorStreamFactory factory = new CompressorStreamFactory(); - CompressorInputStream cis = factory.createCompressorInputStream( - new ByteArrayInputStream(prefix, 0, length)); - try { - return CompressorParser.getMediaType(cis); - } finally { - IOUtils.closeQuietly(cis); - } + String type = TikaCompressorStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length)); + return CompressorParser.getMediaType(type); } catch (CompressorException e) { return MediaType.OCTET_STREAM; } diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java index 89ff371..9cff7c4 100644 --- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java +++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java @@ -361,6 +361,16 @@ public class TestContainerAwareDetector { "application/java-archive", "application/java-archive"); } + @Test + public void testLZMAOOM() throws Exception { + assertTypeByData("testLZMA_oom", "application/x-lzma"); + } + + @Test + public void testCompressOOM() throws Exception { + assertTypeByData("testZ_oom.Z", "application/x-compress"); + } + private TikaInputStream getTruncatedFile(String name, int n) throws IOException { try (InputStream input = TestContainerAwareDetector.class.getResourceAsStream( diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java index b80de07..7a006a9 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java @@ -18,9 +18,11 @@ package org.apache.tika.parser.pkg; import static java.nio.charset.StandardCharsets.US_ASCII; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; import java.io.InputStream; +import org.apache.tika.exception.TikaMemoryLimitException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; @@ -98,4 +100,24 @@ public class CompressParserTest extends AbstractPkgTest { // Tar file starts with the directory name assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII)); } + + @Test + public void testLZMAOOM() throws Exception { + try { + XMLResult r = getXML("testLZMA_oom"); + fail("should have thrown TikaMemoryLimitException"); + } catch (TikaMemoryLimitException e) { + } + } + + @Test + public void testCompressOOM() throws Exception { + try { + XMLResult r = getXML("testZ_oom.Z"); + fail("should have thrown TikaMemoryLimitException"); + } catch (TikaMemoryLimitException e) { + } + } + + } \ No newline at end of file diff --git a/tika-parsers/src/test/resources/test-documents/testLZMA_oom b/tika-parsers/src/test/resources/test-documents/testLZMA_oom new file mode 100644 index 0000000..be257f2 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testLZMA_oom differ diff --git a/tika-parsers/src/test/resources/test-documents/testZ_oom.Z b/tika-parsers/src/test/resources/test-documents/testZ_oom.Z new file mode 100644 index 0000000..36d7f52 --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/testZ_oom.Z @@ -0,0 +1 @@ +�B \ No newline at end of file -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
