Author: tallison
Date: Tue Apr 21 17:25:47 2015
New Revision: 1675159
URL: http://svn.apache.org/r1675159
Log:
TIKA-1611 -- allow RecursiveParserWrapper to catch exceptions caused by
embedded documents
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java
tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded_npe.docx
(with props)
Removed:
tika/trunk/tika-batch/src/main/java/org/apache/tika/util/TikaExceptionFilter.java
tika/trunk/tika-batch/src/test/java/org/apache/tika/util/
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java?rev=1675159&r1=1675158&r2=1675159&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
Tue Apr 21 17:25:47 2015
@@ -37,7 +37,7 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.ContentHandlerFactory;
-import org.apache.tika.util.TikaExceptionFilter;
+import org.apache.tika.utils.ExceptionUtils;
import org.xml.sax.helpers.DefaultHandler;
/**
@@ -55,8 +55,6 @@ public class RecursiveParserWrapperFSCon
private final OutputStreamFactory fsOSFactory;
private final TikaConfig tikaConfig;
private String outputEncoding = "UTF-8";
- //TODO: parameterize this
- private TikaExceptionFilter exceptionFilter = new TikaExceptionFilter();
public RecursiveParserWrapperFSConsumer(ArrayBlockingQueue<FileResource>
queue,
@@ -119,7 +117,7 @@ public class RecursiveParserWrapperFSCon
//take the top metadata item
m = metadataList.remove(0);
}
- String stackTrace = exceptionFilter.getStackTrace(t);
+ String stackTrace = ExceptionUtils.getFilteredStackTrace(t);
m.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"runtime",
stackTrace);
metadataList.add(0, m);
} finally {
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java?rev=1675159&r1=1675158&r2=1675159&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
Tue Apr 21 17:25:47 2015
@@ -32,6 +32,7 @@ import org.apache.tika.metadata.TikaCore
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.ContentHandlerFactory;
+import org.apache.tika.utils.ExceptionUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -78,12 +79,14 @@ public class RecursiveParserWrapper impl
//move this to TikaCoreProperties?
public final static Property TIKA_CONTENT =
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content");
- public final static Property PARSE_TIME_MILLIS =
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"parse_time_millis");
+ public final static Property PARSE_TIME_MILLIS =
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX +
"parse_time_millis");
public final static Property WRITE_LIMIT_REACHED =
-
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"write_limit_reached");
+
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX +
"write_limit_reached");
public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
-
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"embedded_resource_limit_reached");
+
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX +
"embedded_resource_limit_reached");
+ public final static Property EMBEDDED_EXCEPTION =
+
Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX +
"embedded_exception");
//move this to TikaCoreProperties?
public final static Property EMBEDDED_RESOURCE_PATH =
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
@@ -92,16 +95,42 @@ public class RecursiveParserWrapper impl
private final ContentHandlerFactory contentHandlerFactory;
private final List<Metadata> metadatas = new LinkedList<Metadata>();
+ private final boolean catchEmbeddedExceptions;
+
//used in naming embedded resources that don't have a name.
private int unknownCount = 0;
private int maxEmbeddedResources = -1;
private boolean hitMaxEmbeddedResources = false;
-
+
+ /**
+ * Initialize the wrapper with {@link #catchEmbeddedExceptions} set
+ * to <code>true</code> as default.
+ *
+ * @param wrappedParser parser to use for the container documents and the embedded documents
+ * @param contentHandlerFactory factory to use to generate a new content handler for
+ * the container document and each embedded document
+ */
public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory
contentHandlerFactory) {
+ this(wrappedParser, contentHandlerFactory, true);
+ }
+
+ /**
+ * Initialize the wrapper.
+ *
+ * @param wrappedParser parser to use for the container documents and the embedded documents
+ * @param contentHandlerFactory factory to use to generate a new content handler for
+ * the container document and each embedded document
+ * @param catchEmbeddedExceptions whether or not to catch the embedded exceptions.
+ * If set to <code>true</code>, the stack traces will be stored in
+ * the metadata object with key: {@link #EMBEDDED_EXCEPTION}.
+ */
+ public RecursiveParserWrapper(Parser wrappedParser,
+ ContentHandlerFactory contentHandlerFactory,
boolean catchEmbeddedExceptions) {
this.wrappedParser = wrappedParser;
this.contentHandlerFactory = contentHandlerFactory;
+ this.catchEmbeddedExceptions = catchEmbeddedExceptions;
}
-
+
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return wrappedParser.getSupportedTypes(context);
@@ -244,12 +273,6 @@ public class RecursiveParserWrapper impl
}
}
-
- /**
- * Override for different behavior.
- *
- * @return handler to be used for each document
- */
private class EmbeddedParserDecorator extends ParserDecorator {
@@ -282,7 +305,7 @@ public class RecursiveParserWrapper impl
String objectLocation = this.location + objectName;
metadata.add(EMBEDDED_RESOURCE_PATH, objectLocation);
-
+
//ignore the content handler that is passed in
//and get a fresh handler
ContentHandler localHandler =
contentHandlerFactory.getNewContentHandler();
@@ -297,6 +320,25 @@ public class RecursiveParserWrapper impl
if (wlr == true) {
metadata.add(WRITE_LIMIT_REACHED, "true");
} else {
+ if (catchEmbeddedExceptions) {
+ String trace = ExceptionUtils.getStackTrace(e);
+ metadata.set(EMBEDDED_EXCEPTION, trace);
+ } else {
+ throw e;
+ }
+ }
+ } catch (IOException e) {
+ if (catchEmbeddedExceptions) {
+ String trace = ExceptionUtils.getStackTrace(e);
+ metadata.set(EMBEDDED_EXCEPTION, trace);
+ } else {
+ throw e;
+ }
+ } catch (TikaException e) {
+ if (catchEmbeddedExceptions) {
+ String trace = ExceptionUtils.getStackTrace(e);
+ metadata.set(EMBEDDED_EXCEPTION, trace);
+ } else {
throw e;
}
} finally {
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java?rev=1675159&view=auto
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java
(added)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java
Tue Apr 21 17:25:47 2015
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+
+public class ExceptionUtils {
+
+ private final static Pattern MSG_PATTERN = Pattern.compile(":[^\r\n]+");
+
+ /**
+ * Simple util to get stack trace.
+ * <p>
+ * This will unwrap a TikaException and use its cause, if not null
+ * <p>
+ * NOTE: If your stacktraces are truncated, make sure to start your jvm
+ * with: -XX:-OmitStackTraceInFastThrow
+ *
+ * @param t throwable
+ * @return stack trace of t, or of t's cause when t is a TikaException
+ * with a non-null cause
+ */
+ public static String getFilteredStackTrace(Throwable t) {
+ Throwable cause = t;
+ if ((t instanceof TikaException) &&
+ t.getCause() != null) {
+ cause = t.getCause();
+ }
+ return getStackTrace(cause);
+ }
+
+ /**
+ * Get the full stacktrace as a string
+ * @param t throwable whose stack trace is printed
+ * @return the text written by <code>t.printStackTrace</code>
+ */
+ public static String getStackTrace(Throwable t) {
+ Writer result = new StringWriter();
+ PrintWriter writer = new PrintWriter(result);
+ t.printStackTrace(writer);
+ try {
+ writer.flush();
+ result.flush();
+ writer.close();
+ result.close();
+ } catch (IOException e) {
+ //swallow: the Writer API declares IOException, but the sink is an in-memory StringWriter
+ }
+ return result.toString();
+ }
+
+ /**
+ * Utility method to trim the message from a stack trace
+ * string.
+ * <p>
+ * E.g. <code>java.lang.IllegalStateException: Potential loop detected </code>
+ * will be trimmed to <code>java.lang.IllegalStateException</code>
+ * @param trace string view of stack trace
+ * @return trace with the first colon-to-end-of-line run removed, if any
+ */
+ public static String trimMessage(String trace) {
+ Matcher msgMatcher = MSG_PATTERN.matcher(trace);
+ if (msgMatcher.find()) {
+ return msgMatcher.replaceFirst("");
+ }
+ return trace;
+ }
+}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java?rev=1675159&r1=1675158&r2=1675159&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
Tue Apr 21 17:25:47 2015
@@ -18,20 +18,22 @@ package org.apache.tika.parser;
*/
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.ContentHandlerFactory;
-import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
public class RecursiveParserWrapperTest {
@@ -188,15 +190,56 @@ public class RecursiveParserWrapperTest
}
assertEquals(targets, seen);
}
-
- private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory contentHandlerFactory)
- throws Exception{
+
+ @Test
+ public void testEmbeddedNPE() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY,
"test_recursive_embedded_npe.docx");
+ List<Metadata> list = getMetadata(metadata,
+ new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+ //default behavior (user doesn't specify whether or not to catch embedded exceptions)
+ //is to catch the exception
+ assertEquals(13, list.size());
+ Metadata mockNPEMetadata = list.get(10);
+ assertContains("java.lang.NullPointerException",
mockNPEMetadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+
+ metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY,
"test_recursive_embedded_npe.docx");
+ list = getMetadata(metadata,
+ new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
+ false);
+
+ //Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions
+ //and just doesn't bother to report that there was an exception.
+ assertEquals(12, list.size());
+ }
+
+ private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory contentHandlerFactory,
+ boolean catchEmbeddedExceptions) throws
Exception {
ParseContext context = new ParseContext();
Parser wrapped = new AutoDetectParser();
- RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
contentHandlerFactory);
- InputStream stream =
RecursiveParserWrapperTest.class.getResourceAsStream(
- "/test-documents/test_recursive_embedded.docx");
- wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ contentHandlerFactory, catchEmbeddedExceptions);
+ String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (path == null) {
+ path = "/test-documents/test_recursive_embedded.docx";
+ } else {
+ path = "/test-documents/"+path;
+ }
+ InputStream stream = null;
+ try {
+ stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ path);
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
return wrapper.getMetadata();
+
+ }
+
+ private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory contentHandlerFactory)
+ throws Exception {
+ return getMetadata(metadata, contentHandlerFactory, true);
}
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded_npe.docx
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded_npe.docx?rev=1675159&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded_npe.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified:
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java?rev=1675159&r1=1675158&r2=1675159&view=diff
==============================================================================
---
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
(original)
+++
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
Tue Apr 21 17:25:47 2015
@@ -67,9 +67,9 @@ public class RecursiveMetadataResourceTe
Reader reader = new InputStreamReader((InputStream)
response.getEntity(), IOUtils.UTF_8);
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
- assertEquals(11, metadataList.size());
+ assertEquals(12, metadataList.size());
assertEquals("Microsoft Office Word",
metadataList.get(0).get("Application-Name"));
- assertContains("plundered our seas",
metadataList.get(5).get("X-TIKA:content"));
+ assertContains("plundered our seas",
metadataList.get(6).get("X-TIKA:content"));
}
@Test