This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9e89b442bd2b211c328eb563e42ed902f9e0ae6e Author: tballison <[email protected]> AuthorDate: Wed Apr 19 21:19:46 2017 -0400 TIKA-2331 -- Upgrade RTFParser to use new TikaMemoryLimitException --- .../apache/tika/parser/rtf/RTFEmbObjHandler.java | 14 +++++++--- .../java/org/apache/tika/parser/rtf/RTFParser.java | 30 +++++++++++++++++++++- .../org/apache/tika/parser/rtf/TextExtractor.java | 6 ++--- .../org/apache/tika/parser/rtf/RTFParserTest.java | 15 +++++++++++ .../org/apache/tika/parser/rtf/tika-config.xml | 26 +++++++++++++++++++ 5 files changed, 83 insertions(+), 8 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java index 5e2ab25..42900fc 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java @@ -24,6 +24,7 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.TikaMemoryLimitException; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -70,11 +71,13 @@ class RTFEmbObjHandler { private StringBuilder sb = new StringBuilder(); private Metadata metadata; private EMB_STATE state = EMB_STATE.NADA; + private final int memoryLimitInKb; - protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) { + protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context, int memoryLimitInKb) { this.handler = handler; this.embeddedDocumentUtil = new EmbeddedDocumentUtil(context); os = new ByteArrayOutputStream(); + this.memoryLimitInKb = memoryLimitInKb; } protected void startPict() { @@ -145,8 +148,13 @@ class RTFEmbObjHandler { } protected void writeBytes(InputStream is, int len) throws IOException, TikaException { - if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) { - throw new IOException("length of bytes to read out of bounds: " + len); + if (len < 0) { + throw new TikaException("Requesting I read < 0 bytes ?!"); + } + if (len > memoryLimitInKb) { + throw new TikaMemoryLimitException("File embedded in RTF caused this (" + len + + ") bytes), but maximum allowed is ("+memoryLimitInKb+")."+ + "If this is a valid RTF file, consider increasing the memory limit via TikaConfig."); } byte[] bytes = new byte[len]; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java index d2c448b..567a7a8 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java @@ -22,6 +22,7 @@ import java.util.Collections; import java.util.Set; import org.apache.commons.io.input.TaggedInputStream; +import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -53,6 +54,7 @@ public class RTFParser extends AbstractParser { * * @return maximum number of bytes allowed for an embedded object. */ + @Deprecated public static int getMaxBytesForEmbeddedObject() { return EMB_OBJ_MAX_BYTES; } @@ -65,15 +67,24 @@ public class RTFParser extends AbstractParser { * * @param max maximum number of bytes to allow for embedded objects. If * the embedded object has more than this number of bytes, skip it. + * @deprecated use {@link #setMemoryLimitInKb(int)} instead */ + @Deprecated public static void setMaxBytesForEmbeddedObject(int max) { EMB_OBJ_MAX_BYTES = max; + USE_STATIC = true; } + //get rid of this once we get rid of the other static maxbytes... + private static volatile boolean USE_STATIC = false; + public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } + @Field + private int memoryLimitInKb = EMB_OBJ_MAX_BYTES; + public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) @@ -82,7 +93,7 @@ public class RTFParser extends AbstractParser { TaggedInputStream tagged = new TaggedInputStream(stream); try { XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata); - RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context); + RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context, getMemoryLimitInKb()); final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler); ert.extract(stream); } catch (IOException e) { @@ -90,4 +101,21 @@ public class RTFParser extends AbstractParser { throw new TikaException("Error parsing an RTF document", e); } } + + @Field + public void setMemoryLimitInKb(int memoryLimitInKb) { + this.memoryLimitInKb = memoryLimitInKb; + USE_STATIC = false; + } + + private int getMemoryLimitInKb() { + //there's a race condition here, but it shouldn't matter. + if (USE_STATIC) { + if (EMB_OBJ_MAX_BYTES < 0) { + return EMB_OBJ_MAX_BYTES; + } + return EMB_OBJ_MAX_BYTES/1024; + } + return memoryLimitInKb; + } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java index 8ba8961..b07a3a0 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java @@ -947,10 +947,8 @@ final class TextExtractor { if (groupState.pictDepth == 1) { try { embObjHandler.writeBytes(in, param); - } catch (IOException e) { - //param was out of bounds or something went wrong during writing. - //skip this obj and move on - //TODO: log.warn + } catch (IOException|TikaException e) { + EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); embObjHandler.reset(); } } else { diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java index b957b8c..aed6cf5 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java @@ -35,6 +35,7 @@ import java.util.Set; import org.apache.commons.io.FilenameUtils; import org.apache.tika.Tika; import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.extractor.ContainerExtractor; import org.apache.tika.extractor.ParserContainerExtractor; import org.apache.tika.io.TikaInputStream; @@ -524,6 +525,20 @@ public class RTFParserTest extends TikaTest { assertEquals(2, tracker.filenames.size()); } + @Test + public void testConfig() throws Exception { + //test that memory allocation of the bin element is limited + //via the config file. Unfortunately, this test file's bin embedding contains 10 bytes + //so we had to set the config to 0. + InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/rtf/tika-config.xml"); + assertNotNull(is); + TikaConfig tikaConfig = new TikaConfig(is); + Parser p = new AutoDetectParser(tikaConfig); + List<Metadata> metadataList = getRecursiveMetadata("testBinControlWord.rtf", p); + assertEquals(1, metadataList.size()); + assertContains("TikaMemoryLimitException", metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM)); + } + private Result getResult(String filename) throws Exception { File file = getResourceAsFile("/test-documents/" + filename); diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml new file mode 100644 index 0000000..1f53a78 --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.rtf.RTFParser"> + <params> + <param name="memoryLimitInKb" type="int">0</param> + </params> + </parser> + </parsers> +</properties> -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
