Author: tallison
Date: Tue Apr 21 17:25:47 2015
New Revision: 1675159

URL: http://svn.apache.org/r1675159
Log:
TIKA-1611 -- allow RecursiveParserWrapper to catch exceptions caused by 
embedded documents

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java
    
tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded_npe.docx
   (with props)
Removed:
    
tika/trunk/tika-batch/src/main/java/org/apache/tika/util/TikaExceptionFilter.java
    tika/trunk/tika-batch/src/test/java/org/apache/tika/util/
Modified:
    
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
    
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java

Modified: 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java?rev=1675159&r1=1675158&r2=1675159&view=diff
==============================================================================
--- 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
 (original)
+++ 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
 Tue Apr 21 17:25:47 2015
@@ -37,7 +37,7 @@ import org.apache.tika.parser.ParseConte
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.ContentHandlerFactory;
-import org.apache.tika.util.TikaExceptionFilter;
+import org.apache.tika.utils.ExceptionUtils;
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
@@ -55,8 +55,6 @@ public class RecursiveParserWrapperFSCon
     private final OutputStreamFactory fsOSFactory;
     private final TikaConfig tikaConfig;
     private String outputEncoding = "UTF-8";
-    //TODO: parameterize this
-    private TikaExceptionFilter exceptionFilter = new TikaExceptionFilter();
 
 
     public RecursiveParserWrapperFSConsumer(ArrayBlockingQueue<FileResource> 
queue,
@@ -119,7 +117,7 @@ public class RecursiveParserWrapperFSCon
                 //take the top metadata item
                 m = metadataList.remove(0);
             }
-            String stackTrace = exceptionFilter.getStackTrace(t);
+            String stackTrace = ExceptionUtils.getFilteredStackTrace(t);
             m.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"runtime", 
stackTrace);
             metadataList.add(0, m);
         } finally {

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java?rev=1675159&r1=1675158&r2=1675159&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
 (original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
 Tue Apr 21 17:25:47 2015
@@ -32,6 +32,7 @@ import org.apache.tika.metadata.TikaCore
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.sax.ContentHandlerFactory;
+import org.apache.tika.utils.ExceptionUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
@@ -78,12 +79,14 @@ public class RecursiveParserWrapper impl
 
     //move this to TikaCoreProperties?
     public final static Property TIKA_CONTENT = 
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content");
-    public final static Property PARSE_TIME_MILLIS = 
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"parse_time_millis");
+    public final static Property PARSE_TIME_MILLIS = 
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + 
"parse_time_millis");
     public final static Property WRITE_LIMIT_REACHED =
-                
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"write_limit_reached");
+                
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + 
"write_limit_reached");
     public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = 
-                
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"embedded_resource_limit_reached");
+                
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + 
"embedded_resource_limit_reached");
 
+    public final static Property EMBEDDED_EXCEPTION =
+            
Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + 
"embedded_exception");
     //move this to TikaCoreProperties?
     public final static Property EMBEDDED_RESOURCE_PATH = 
                 
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
@@ -92,16 +95,42 @@ public class RecursiveParserWrapper impl
     private final ContentHandlerFactory contentHandlerFactory;
     private final List<Metadata> metadatas = new LinkedList<Metadata>();
 
+    private final boolean catchEmbeddedExceptions;
+
     //used in naming embedded resources that don't have a name.
     private int unknownCount = 0;   
     private int maxEmbeddedResources = -1;
     private boolean hitMaxEmbeddedResources = false;
-    
+
+    /**
+     * Initialize the wrapper with {@link #catchEmbeddedExceptions} set
+     * to <code>true</code> as default.
+     *
+     * @param wrappedParser parser to use for the container documents and the 
embedded documents
+     * @param contentHandlerFactory factory to use to generate a new content 
handler for
+     *                              the container document and each embedded 
document
+     */
     public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory 
contentHandlerFactory) {
+        this(wrappedParser, contentHandlerFactory, true);
+    }
+
+    /**
+     * Initialize the wrapper.
+     *
+     * @param wrappedParser parser to use for the container documents and the 
embedded documents
+     * @param contentHandlerFactory factory to use to generate a new content 
handler for
+     *                              the container document and each embedded 
document
+     * @param catchEmbeddedExceptions whether or not to catch the embedded 
exceptions.
+     *                                If set to <code>true</code>, the stack 
traces will be stored in
+     *                                the metadata object with key: {@link 
#EMBEDDED_EXCEPTION}.
+     */
+    public RecursiveParserWrapper(Parser wrappedParser,
+                                  ContentHandlerFactory contentHandlerFactory, 
boolean catchEmbeddedExceptions) {
         this.wrappedParser = wrappedParser;
         this.contentHandlerFactory = contentHandlerFactory;
+        this.catchEmbeddedExceptions = catchEmbeddedExceptions;
     }
-    
+
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return wrappedParser.getSupportedTypes(context);
@@ -244,12 +273,6 @@ public class RecursiveParserWrapper impl
         }
 
     }
-    
-    /**
-     * Override for different behavior.
-     * 
-     * @return handler to be used for each document
-     */
 
     
     private class EmbeddedParserDecorator extends ParserDecorator {
@@ -282,7 +305,7 @@ public class RecursiveParserWrapper impl
             String objectLocation = this.location + objectName;
       
             metadata.add(EMBEDDED_RESOURCE_PATH, objectLocation);
-            
+
             //ignore the content handler that is passed in
             //and get a fresh handler
             ContentHandler localHandler = 
contentHandlerFactory.getNewContentHandler();
@@ -297,6 +320,25 @@ public class RecursiveParserWrapper impl
                 if (wlr == true) {
                     metadata.add(WRITE_LIMIT_REACHED, "true");
                 } else {
+                    if (catchEmbeddedExceptions) {
+                        String trace = ExceptionUtils.getStackTrace(e);
+                        metadata.set(EMBEDDED_EXCEPTION, trace);
+                    } else {
+                        throw e;
+                    }
+                }
+            } catch (IOException e) {
+                if (catchEmbeddedExceptions) {
+                    String trace = ExceptionUtils.getStackTrace(e);
+                    metadata.set(EMBEDDED_EXCEPTION, trace);
+                } else {
+                    throw e;
+                }
+            } catch (TikaException e) {
+                if (catchEmbeddedExceptions) {
+                    String trace = ExceptionUtils.getStackTrace(e);
+                    metadata.set(EMBEDDED_EXCEPTION, trace);
+                } else {
                     throw e;
                 }
             } finally {

Added: 
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java?rev=1675159&view=auto
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java 
(added)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java 
Tue Apr 21 17:25:47 2015
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+
+public class ExceptionUtils {
+
+    private final static Pattern MSG_PATTERN = Pattern.compile(":[^\r\n]+");
+
+    /**
+     * Simple util to get stack trace.
+     * <p>
+     * This will unwrap a TikaException and use its cause, if not null.
+     * <p>
+     * NOTE: If your stacktraces are truncated, make sure to start your jvm
+     * with: -XX:-OmitStackTraceInFastThrow
+     *
+     * @param t throwable
+     * @return stack trace of the TikaException's cause if it has one,
+     *         otherwise the stack trace of <code>t</code> itself
+     */
+    public static String getFilteredStackTrace(Throwable t) {
+        Throwable cause = t;
+        if ((t instanceof TikaException) &&
+                t.getCause() != null) {
+            cause = t.getCause();
+        }
+        return getStackTrace(cause);
+    }
+
+    /**
+     * Get the full stacktrace as a string
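+     * @param t throwable whose stack trace should be rendered
+     * @return everything <code>t.printStackTrace</code> prints, as a string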
+     */
+    public static String getStackTrace(Throwable t) {
+        Writer result = new StringWriter();
+        PrintWriter writer = new PrintWriter(result);
+        t.printStackTrace(writer);
+        try {
+            writer.flush();
+            result.flush();
+            writer.close();
+            result.close();
+        } catch (IOException e) {
+            //swallow: the underlying StringWriter never actually throws here
+        }
+        return result.toString();
+    }
+
+    /**
+     * Utility method to trim the message from a stack trace
+     * string.
+     * <p>
+     * E.g. <code>java.lang.IllegalStateException: Potential loop detected 
</code>
+     * will be trimmed to <code>java.lang.IllegalStateException</code>
+     * @param trace string view of stack trace
+     * @return trimmed stack trace
+     */
+    public static String trimMessage(String trace) {
+        Matcher msgMatcher = MSG_PATTERN.matcher(trace);
+        if (msgMatcher.find()) {
+            return msgMatcher.replaceFirst("");
+        }
+        return trace;
+    }
+}

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java?rev=1675159&r1=1675158&r2=1675159&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
 Tue Apr 21 17:25:47 2015
@@ -18,20 +18,22 @@ package org.apache.tika.parser;
  */
 
 
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.ContentHandlerFactory;
-import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
 
 import java.io.InputStream;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
 
 public class RecursiveParserWrapperTest {
 
@@ -188,15 +190,56 @@ public class RecursiveParserWrapperTest
         }
         assertEquals(targets, seen);
     }
-    
-    private List<Metadata> getMetadata(Metadata metadata, 
ContentHandlerFactory contentHandlerFactory)
-            throws Exception{
+
+    @Test
+    public void testEmbeddedNPE() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, 
"test_recursive_embedded_npe.docx");
+        List<Metadata> list = getMetadata(metadata,
+                new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+        //default behavior (user doesn't specify whether or not to catch 
embedded exceptions)
+        //is to catch the exception
+        assertEquals(13, list.size());
+        Metadata mockNPEMetadata = list.get(10);
+        assertContains("java.lang.NullPointerException", 
mockNPEMetadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+
+        metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, 
"test_recursive_embedded_npe.docx");
+        list = getMetadata(metadata,
+                new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
+                false);
+
+        //Composite parser swallows caught TikaExceptions, IOExceptions and 
SAXExceptions
+        //and just doesn't bother to report that there was an exception.
+        assertEquals(12, list.size());
+    }
+
+    private List<Metadata> getMetadata(Metadata metadata, 
ContentHandlerFactory contentHandlerFactory,
+                                       boolean catchEmbeddedExceptions) throws 
Exception {
         ParseContext context = new ParseContext();
         Parser wrapped = new AutoDetectParser();
-        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, 
contentHandlerFactory);
-        InputStream stream = 
RecursiveParserWrapperTest.class.getResourceAsStream(
-                "/test-documents/test_recursive_embedded.docx");
-        wrapper.parse(stream, new DefaultHandler(), metadata, context);
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+                contentHandlerFactory, catchEmbeddedExceptions);
+        String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (path == null) {
+            path = "/test-documents/test_recursive_embedded.docx";
+        } else {
+            path = "/test-documents/"+path;
+        }
+        InputStream stream = null;
+        try {
+            stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+                    path);
+            wrapper.parse(stream, new DefaultHandler(), metadata, context);
+        } finally {
+            IOUtils.closeQuietly(stream);
+        }
         return wrapper.getMetadata();
+
+    }
+
+    private List<Metadata> getMetadata(Metadata metadata, 
ContentHandlerFactory contentHandlerFactory)
+            throws Exception {
+        return getMetadata(metadata, contentHandlerFactory, true);
     }
 }

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded_npe.docx
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded_npe.docx?rev=1675159&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded_npe.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: 
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java?rev=1675159&r1=1675158&r2=1675159&view=diff
==============================================================================
--- 
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
 (original)
+++ 
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
 Tue Apr 21 17:25:47 2015
@@ -67,9 +67,9 @@ public class RecursiveMetadataResourceTe
         Reader reader = new InputStreamReader((InputStream) 
response.getEntity(), IOUtils.UTF_8);
         List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
 
-        assertEquals(11, metadataList.size());
+        assertEquals(12, metadataList.size());
         assertEquals("Microsoft Office Word", 
metadataList.get(0).get("Application-Name"));
-        assertContains("plundered our seas", 
metadataList.get(5).get("X-TIKA:content"));
+        assertContains("plundered our seas", 
metadataList.get(6).get("X-TIKA:content"));
     }
 
     @Test


Reply via email to