Repository: tika
Updated Branches:
  refs/heads/master f5b04b60c -> 71cb9363c
TIKA-2040 - prevent permanent hang/oom on corrupt chm file

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/71cb9363
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/71cb9363
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/71cb9363

Branch: refs/heads/master
Commit: 71cb9363c07839e68712edde4626d53aa928cc2a
Parents: f5b04b6
Author: tballison <talli...@mitre.org>
Authored: Tue Jul 26 21:33:10 2016 -0400
Committer: tballison <talli...@mitre.org>
Committed: Tue Jul 26 21:33:10 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |  2 ++
 .../chm/accessor/ChmDirectoryListingSet.java    | 11 +++++-----
 .../apache/tika/parser/chm/core/ChmCommons.java |  5 ++++-
 .../tika/parser/chm/core/ChmExtractor.java      |  4 ++--
 .../apache/tika/parser/chm/lzx/ChmLzxBlock.java |  4 ++--
 .../tika/parser/chm/TestChmExtractor.java       | 21 ++++++++++++++++---
 .../resources/test-documents/testChm_oom.chm    | Bin 0 -> 4315 bytes
 7 files changed, 34 insertions(+), 13 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 9dd40b5..a994600 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.14 - ???
 
+  * Prevent OOM/permanent hang on some corrupt CHM files (TIKA-2040).
+
   * Upgrade ICU4J charset detection components to fix multithreading bug (TIKA-2041).
http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
index 9d0a2f0..e96426f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
@@ -16,16 +16,17 @@
  */
 package org.apache.tika.parser.chm.accessor;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.math.BigInteger;
 import java.util.ArrayList;
 import java.util.List;
+
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.chm.core.ChmCommons;
 import org.apache.tika.parser.chm.core.ChmConstants;
 import org.apache.tika.parser.chm.exception.ChmParsingException;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 /**
  * Holds chm listing entries
  */
@@ -121,7 +122,7 @@ public class ChmDirectoryListingSet {
      * chm itsp PMGLheader
      */
     private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
-            ChmItspHeader chmItspHeader) {
+            ChmItspHeader chmItspHeader) throws TikaException {
         try {
             int startPmgl = chmItspHeader.getIndex_head();
             int stopPmgl = chmItspHeader.getUnknown_0024();
@@ -145,7 +146,7 @@ public class ChmDirectoryListingSet {
                 i=PMGLheader.getBlockNext();
                 dir_chunk = null;
             }
-        } catch (Exception e) {
+        } catch (ChmParsingException e) {
             e.printStackTrace();
         } finally {
             setData(null);
@@ -196,7 +197,7 @@ public class ChmDirectoryListingSet {
      *
      * @param dir_chunk
      */
-    private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
+    private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException, TikaException {
         // try {
         if
(dir_chunk != null) {
             int header_len;

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
index cded7f2..a9d2454 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
@@ -332,11 +332,14 @@ public class ChmCommons {
     /*
      * This method is added because of supporting of Java 5
      */
-    public static byte[] copyOfRange(byte[] original, int from, int to) {
+    public static byte[] copyOfRange(byte[] original, int from, int to) throws TikaException {
         checkCopyOfRangeParams(original, from, to);
         int newLength = to - from;
         if (newLength < 0)
             throw new IllegalArgumentException(from + " > " + to);
+        if (to > original.length) {
+            throw new TikaException("can't copy beyond array length");
+        }
         byte[] copy = new byte[newLength];
         System.arraycopy(original, from, copy, 0,
                 Math.min(original.length - from, newLength));
         return copy;

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
index 454c1c4..c1e4495 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.parser.chm.core;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.io.ByteArrayOutputStream;
 import
java.io.IOException;
 import java.io.InputStream;
@@ -35,8 +37,6 @@ import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
 import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
 import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 /**
  * Extracts text from chm file. Enumerates chm entries.
  */

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
index 9ca3595..b5ea37a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
@@ -846,12 +846,12 @@ public class ChmLzxBlock {
         return content;
     }
 
-    public byte[] getContent(int startOffset, int endOffset) {
+    public byte[] getContent(int startOffset, int endOffset) throws TikaException {
         return (getContent() != null) ?
                 ChmCommons.copyOfRange(getContent(), startOffset, endOffset) : new byte[1];
     }
 
-    public byte[] getContent(int start) {
+    public byte[] getContent(int start) throws TikaException {
         return (getContent() != null) ?
ChmCommons.copyOfRange(getContent(), start, getContent().length) : new byte[1];
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java b/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
index c072db0..60d3e31 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
@@ -16,18 +16,23 @@
  */
 package org.apache.tika.parser.chm;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
 import java.io.ByteArrayInputStream;
 import java.util.List;
+
+import org.apache.tika.TikaTest;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
 import org.apache.tika.parser.chm.core.ChmExtractor;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
 import org.junit.Before;
 import org.junit.Test;
 
-public class TestChmExtractor {
+public class TestChmExtractor extends TikaTest {
     private ChmExtractor chmExtractor = null;
 
     @Before
@@ -60,4 +65,14 @@ public class TestChmExtractor {
         assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count);
     }
 
+    @Test
+    public void testOOMOnCorruptCHM() throws Exception {
+        try {
+            XMLResult r = getXML("testChm_oom.chm");
+            fail("should have thrown TikaException");
+        } catch (TikaException e) {
+            assertTrue("correct exception thrown", true);
+        }
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/test/resources/test-documents/testChm_oom.chm
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testChm_oom.chm b/tika-parsers/src/test/resources/test-documents/testChm_oom.chm
new file mode 100644
index 0000000..675485b
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testChm_oom.chm differ