This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 64fef4e  TIKA-2644 improve api for recursiveparserwrapper -- 
deconflicted
64fef4e is described below

commit 64fef4ebf77c6e7d1be47c53810986df657eb508
Author: tballison <talli...@mitre.org>
AuthorDate: Thu May 17 16:22:57 2018 -0400

    TIKA-2644 improve api for recursiveparserwrapper -- deconflicted
---
 .../apache/tika/parser/RecursiveParserWrapper.java | 344 ++++++++++++---------
 .../java/org/apache/tika/utils/ParserUtils.java    |  54 +---
 .../tika/parser/RecursiveParserWrapperTest.java    | 130 ++++++--
 3 files changed, 296 insertions(+), 232 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java 
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 9484d4c..ee60f29 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -17,32 +17,30 @@ package org.apache.tika.parser;
  * limitations under the License.
  */
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Date;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Set;
-
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.sax.ContentHandlerFactory;
-import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.utils.ParserUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.Set;
 
 /**
  * This is a helper class that wraps a parser in a recursive handler.
- * It takes care of setting the embedded parser in the ParseContext 
+ * It takes care of setting the embedded parser in the ParseContext
  * and handling the embedded path calculations.
  * <p>
- * After parsing a document, call getMetadata() to retrieve a list of 
+ * After parsing a document, call getMetadata() to retrieve a list of
  * Metadata objects, one for each embedded resource.  The first item
  * in the list will contain the Metadata for the outer container file.
  * <p>
@@ -52,18 +50,18 @@ import org.xml.sax.helpers.DefaultHandler;
  * <p>
  * If a WriteLimitReachedException is encountered, the wrapper will stop
  * processing the current resource, and it will not process
- * any of the child resources for the given resource.  However, it will try to 
- * parse as much as it can.  If a WLRE is reached in the parent document, 
+ * any of the child resources for the given resource.  However, it will try to
+ * parse as much as it can.  If a WLRE is reached in the parent document,
  * no child resources will be parsed.
  * <p>
  * The implementation is based on Jukka's RecursiveMetadataParser
- * and Nick's additions. See: 
+ * and Nick's additions. See:
  * <a 
href="http://wiki.apache.org/tika/RecursiveMetadata#Jukka.27s_RecursiveMetadata_Parser">RecursiveMetadataParser</a>.
  * <p>
  * Note that this wrapper holds all data in memory and is not appropriate
  * for files with content too large to be held in memory.
  * <p>
- * Note, too, that this wrapper is not thread safe because it stores state.  
+ * Note, too, that this wrapper is not thread safe because it stores state.
  * The client must initialize a new wrapper for each thread, and the client
  * is responsible for calling {@link #reset()} after each parse.
  * <p>
@@ -71,45 +69,99 @@ import org.xml.sax.helpers.DefaultHandler;
  * </p>
  */
 public class RecursiveParserWrapper extends ParserDecorator {
-    
+
     /**
      * Generated serial version
      */
     private static final long serialVersionUID = 9086536568120690938L;
 
-    //move this to TikaCoreProperties?
-    public final static Property TIKA_CONTENT = 
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content");
-    public final static Property PARSE_TIME_MILLIS = 
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + 
"parse_time_millis");
+    /**
+     * @deprecated use {@link 
org.apache.tika.sax.RecursiveParserWrapperHandler#TIKA_CONTENT}
+     */
+    @Deprecated
+    public final static Property TIKA_CONTENT = 
AbstractRecursiveParserWrapperHandler.TIKA_CONTENT;
+    /**
+     * @deprecated use {@link 
org.apache.tika.sax.RecursiveParserWrapperHandler#PARSE_TIME_MILLIS}
+     */
+    @Deprecated
+    public final static Property PARSE_TIME_MILLIS = 
AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS;
+
+    /**
+     * @deprecated use {@link 
org.apache.tika.sax.RecursiveParserWrapperHandler#WRITE_LIMIT_REACHED}
+     */
+    @Deprecated
     public final static Property WRITE_LIMIT_REACHED =
-                
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + 
"write_limit_reached");
-    public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = 
-                
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + 
"embedded_resource_limit_reached");
-
-    public final static Property EMBEDDED_EXCEPTION =
-            
Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + 
"embedded_exception");
-    //move this to TikaCoreProperties?
-    public final static Property EMBEDDED_RESOURCE_PATH = 
-                
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
- 
-    private final Parser wrappedParser;
-    private final ContentHandlerFactory contentHandlerFactory;
-    private final List<Metadata> metadatas = new LinkedList<>();
+            AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED;
+    /**
+     * @deprecated use {@link 
org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_RESOURCE_LIMIT_REACHED}
+     */
+    @Deprecated
+    public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
+            
AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED;
+
+    /**
+     * @deprecated use {@link 
org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_EXCEPTION}
+     */
+    @Deprecated
+    public final static Property EMBEDDED_EXCEPTION = 
AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION;
+
+    /**
+     * @deprecated use {@link 
org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_RESOURCE_PATH}
+     */
+    @Deprecated
+    public final static Property EMBEDDED_RESOURCE_PATH = 
AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH;
+
+    /**
+     * @deprecated this should be passed in via the {@link 
RecursiveParserWrapperHandler}
+     */
+    @Deprecated
+    private ContentHandlerFactory contentHandlerFactory = null;
 
     private final boolean catchEmbeddedExceptions;
 
-    //used in naming embedded resources that don't have a name.
-    private int unknownCount = 0;   
+    /**
+     * set this on the RecursiveParserWrapperHandler instead
+     * @deprecated this is here only for legacy behavior; it will be removed 
in 2.0 and/or 1.20
+     */
+    @Deprecated
     private int maxEmbeddedResources = -1;
-    private boolean hitMaxEmbeddedResources = false;
+    /**
+     * @deprecated this is here only for legacy behavior; it will be removed 
in 2.0 and/or 1.20
+     */
+    @Deprecated
+    private ParserState lastParseState = null;
 
     /**
      * Initialize the wrapper with {@link #catchEmbeddedExceptions} set
      * to <code>true</code> as default.
      *
      * @param wrappedParser parser to use for the container documents and the 
embedded documents
+     */
+    public RecursiveParserWrapper(Parser wrappedParser) {
+        this(wrappedParser, true);
+    }
+
+    /**
+     *
+     * @param wrappedParser parser to wrap
+     * @param catchEmbeddedExceptions whether or not to catch+record embedded 
exceptions.
+     *                                If set to <code>false</code>, embedded 
exceptions will be thrown and
+     *                                the rest of the file will not be parsed
+     */
+    public RecursiveParserWrapper(Parser wrappedParser, boolean 
catchEmbeddedExceptions) {
+        super(wrappedParser);
+        this.catchEmbeddedExceptions = catchEmbeddedExceptions;
+    }
+    /**
+     * Initialize the wrapper with {@link #catchEmbeddedExceptions} set
+     * to <code>true</code> as default.
+     *
+     * @param wrappedParser parser to use for the container documents and the 
embedded documents
      * @param contentHandlerFactory factory to use to generate a new content 
handler for
      *                              the container document and each embedded 
document
+     * @deprecated use {@link 
RecursiveParserWrapper#RecursiveParserWrapper(Parser)}
      */
+    @Deprecated
     public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory 
contentHandlerFactory) {
         this(wrappedParser, contentHandlerFactory, true);
     }
@@ -123,23 +175,24 @@ public class RecursiveParserWrapper extends 
ParserDecorator {
      * @param catchEmbeddedExceptions whether or not to catch the embedded 
exceptions.
      *                                If set to <code>true</code>, the stack 
traces will be stored in
      *                                the metadata object with key: {@link 
#EMBEDDED_EXCEPTION}.
+     * @deprecated use {@link 
RecursiveParserWrapper#RecursiveParserWrapper(Parser, boolean)}
      */
+    @Deprecated
     public RecursiveParserWrapper(Parser wrappedParser,
                                   ContentHandlerFactory contentHandlerFactory, 
boolean catchEmbeddedExceptions) {
         super(wrappedParser);
-        this.wrappedParser = wrappedParser;
         this.contentHandlerFactory = contentHandlerFactory;
         this.catchEmbeddedExceptions = catchEmbeddedExceptions;
     }
 
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return wrappedParser.getSupportedTypes(context);
+        return getWrappedParser().getSupportedTypes(context);
     }
 
     /**
      * Acts like a regular parser except it ignores the ContentHandler
-     * and it automatically sets/overwrites the embedded Parser in the 
+     * and it automatically sets/overwrites the embedded Parser in the
      * ParseContext object.
      * <p>
      * To retrieve the results of the parse, use {@link #getMetadata()}.
@@ -147,173 +200,161 @@ public class RecursiveParserWrapper extends 
ParserDecorator {
      * Make sure to call {@link #reset()} after each parse.
      */
     @Override
-    public void parse(InputStream stream, ContentHandler ignore,
-            Metadata metadata, ParseContext context) throws IOException,
+    public void parse(InputStream stream, ContentHandler 
recursiveParserWrapperHandler,
+                      Metadata metadata, ParseContext context) throws 
IOException,
             SAXException, TikaException {
-
-        EmbeddedParserDecorator decorator = new EmbeddedParserDecorator("/");
+        //this tracks the state of the parent parser, per call to #parse
+        //in future versions, we can remove lastParseState, and this will be 
thread-safe
+        ParserState parserState;
+        if (recursiveParserWrapperHandler instanceof 
AbstractRecursiveParserWrapperHandler) {
+            parserState = new 
ParserState((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler);
+        } else {
+            parserState = new ParserState(new 
RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources));
+            lastParseState = parserState;
+        }
+        EmbeddedParserDecorator decorator = new 
EmbeddedParserDecorator(getWrappedParser(), "/", parserState);
         context.set(Parser.class, decorator);
-        ContentHandler localHandler = 
contentHandlerFactory.getNewContentHandler();
-        long started = new Date().getTime();
+        ContentHandler localHandler = 
parserState.recursiveParserWrapperHandler.getNewContentHandler();
+        long started = System.currentTimeMillis();
         try {
-            wrappedParser.parse(stream, localHandler, metadata, context);
+            getWrappedParser().parse(stream, localHandler, metadata, context);
         } catch (SAXException e) {
             boolean wlr = isWriteLimitReached(e);
             if (wlr == false) {
                 throw e;
             }
-            metadata.set(WRITE_LIMIT_REACHED, "true");
+            metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, 
"true");
         } finally {
-            long elapsedMillis = new Date().getTime() - started;
-            metadata.set(PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
-            addContent(localHandler, metadata);
-
-            if (hitMaxEmbeddedResources) {
-                metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true");
-            }
-            metadatas.add(0, deepCopy(metadata));
+            long elapsedMillis = System.currentTimeMillis() - started;
+            metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, 
Long.toString(elapsedMillis));
+            
parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata);
         }
     }
 
     /**
-     * 
-     * The first element in the returned list represents the 
+     *
+     * The first element in the returned list represents the
      * data from the outer container file.  There is no guarantee
      * about the ordering of the list after that.
-     * 
+     *
+     * @deprecated use a {@link RecursiveParserWrapperHandler} instead
+     *
      * @return list of Metadata objects that were gathered during the parse
+     * @throws IllegalStateException if you've used a {@link 
RecursiveParserWrapperHandler} in your last
+     * call to {@link #parse(InputStream, ContentHandler, Metadata, 
ParseContext)}
      */
+    @Deprecated
     public List<Metadata> getMetadata() {
-        return metadatas;
+        if (lastParseState != null) {
+            return ((RecursiveParserWrapperHandler) 
lastParseState.recursiveParserWrapperHandler).getMetadataList();
+        } else {
+            throw new IllegalStateException("This is deprecated; please use a 
RecursiveParserWrapperHandler instead");
+        }
     }
-    
+
     /**
      * Set the maximum number of embedded resources to store.
      * If the max is hit during parsing, the {@link 
#EMBEDDED_RESOURCE_LIMIT_REACHED}
      * property will be added to the container document's Metadata.
-     * 
+     *
      * <p>
      * If this value is < 0 (the default), the wrapper will store all Metadata.
-     * 
+     * @deprecated set this on a {@link RecursiveParserWrapperHandler}
      * @param max maximum number of embedded resources to store
      */
+    @Deprecated
     public void setMaxEmbeddedResources(int max) {
         maxEmbeddedResources = max;
     }
-    
+
 
     /**
-     * This clears the metadata list and resets {@link #unknownCount} and
-     * {@link #hitMaxEmbeddedResources}
+     * This clears the last parser state (metadata list, unknown count, hit 
embeddedresource count)
+     *
+     * @deprecated use a {@link 
org.apache.tika.sax.RecursiveParserWrapperHandler} instead
+     * @throws IllegalStateException if you used a {@link 
RecursiveParserWrapperHandler} in your call
+     * to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
      */
+    @Deprecated
     public void reset() {
-        metadatas.clear();
-        unknownCount = 0;
-        hitMaxEmbeddedResources = false;
+        if (lastParseState != null) {
+            lastParseState = new ParserState(new 
RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources));
+        } else {
+            throw new IllegalStateException("This is deprecated; please use a 
RecursiveParserWrapperHandler instead");
+        }
     }
-    
+
     /**
-     * Copied/modified from WriteOutContentHandler.  Couldn't make that 
-     * static, and we need to have something that will work 
+     * Copied/modified from WriteOutContentHandler.  Couldn't make that
+     * static, and we need to have something that will work
      * with exceptions thrown from both BodyContentHandler and 
WriteOutContentHandler
      * @param t
      * @return
      */
     private boolean isWriteLimitReached(Throwable t) {
-        if (t.getMessage() != null && 
+        if (t.getMessage() != null &&
                 t.getMessage().indexOf("Your document contained more than") == 
0) {
             return true;
         } else {
             return t.getCause() != null && isWriteLimitReached(t.getCause());
         }
     }
-    
-    //defensive copy
-    private Metadata deepCopy(Metadata m) {
-        Metadata clone = new Metadata();
-        
-        for (String n : m.names()){
-            if (! m.isMultiValued(n)) {
-                clone.set(n, m.get(n));
-            } else {
-                String[] vals = m.getValues(n);
-                for (int i = 0; i < vals.length; i++) {
-                    clone.add(n, vals[i]);
-                }
-            }
-        }
-        return clone;
-    }
-    
-    private String getResourceName(Metadata metadata) {
+
+    private String getResourceName(Metadata metadata, ParserState state) {
         String objectName = "";
-        if (metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY) != null) {
-            objectName = metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
-         } else if (metadata.get(TikaMetadataKeys.EMBEDDED_RELATIONSHIP_ID) != 
null) {
-            objectName = 
metadata.get(TikaMetadataKeys.EMBEDDED_RELATIONSHIP_ID);
-         } else {
-            objectName = "embedded-" + (++unknownCount);
-         }
-         //make sure that there isn't any path info in the objectName
-         //some parsers can return paths, not just file names
-         objectName = FilenameUtils.getName(objectName);
-         return objectName;
-    }
-    
-    private void addContent(ContentHandler handler, Metadata metadata) {
-        
-        if (handler.getClass().equals(DefaultHandler.class)){
-            //no-op: we can't rely on just testing for 
-            //empty content because DefaultHandler's toString()
-            //returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd"
+        if (metadata.get(Metadata.RESOURCE_NAME_KEY) != null) {
+            objectName = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        } else if (metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID) != null) {
+            objectName = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
         } else {
-            String content = handler.toString();
-            if (content != null && content.trim().length() > 0 ) {
-                metadata.add(TIKA_CONTENT, content);
-            }
+            objectName = "embedded-" + (++state.unknownCount);
         }
-
+        //make sure that there isn't any path info in the objectName
+        //some parsers can return paths, not just file names
+        objectName = FilenameUtils.getName(objectName);
+        return objectName;
     }
 
-    
+
     private class EmbeddedParserDecorator extends ParserDecorator {
-        
+
         private static final long serialVersionUID = 207648200464263337L;
-        
+
         private String location = null;
+        private final ParserState parserState;
+
 
-        
-        private EmbeddedParserDecorator(String location) {
-            super(wrappedParser);
+        private EmbeddedParserDecorator(Parser parser, String location, 
ParserState parseState) {
+            super(parser);
             this.location = location;
             if (! this.location.endsWith("/")) {
-               this.location += "/";
+                this.location += "/";
             }
+            this.parserState = parseState;
         }
 
         @Override
         public void parse(InputStream stream, ContentHandler ignore,
-                Metadata metadata, ParseContext context) throws IOException,
+                          Metadata metadata, ParseContext context) throws 
IOException,
                 SAXException, TikaException {
             //Test to see if we should avoid parsing
-            if (maxEmbeddedResources > -1 && 
-                    metadatas.size() >= maxEmbeddedResources) {
-                hitMaxEmbeddedResources = true;
+            if 
(parserState.recursiveParserWrapperHandler.hasHitMaximumEmbeddedResources()) {
                 return;
             }
             // Work out what this thing is
-            String objectName = getResourceName(metadata);
+            String objectName = getResourceName(metadata, parserState);
             String objectLocation = this.location + objectName;
-      
-            metadata.add(EMBEDDED_RESOURCE_PATH, objectLocation);
 
-            //ignore the content handler that is passed in
-            //and get a fresh handler
-            ContentHandler localHandler = 
contentHandlerFactory.getNewContentHandler();
-            
+            
metadata.add(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, 
objectLocation);
+
+
+            //get a fresh handler
+            ContentHandler localHandler = 
parserState.recursiveParserWrapperHandler.getNewContentHandler();
+            
parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, 
metadata);
+
             Parser preContextParser = context.get(Parser.class);
-            context.set(Parser.class, new 
EmbeddedParserDecorator(objectLocation));
-            long started = new Date().getTime();
+            context.set(Parser.class, new 
EmbeddedParserDecorator(getWrappedParser(), objectLocation, parserState));
+            long started = System.currentTimeMillis();
             try {
                 super.parse(stream, localHandler, metadata, context);
             } catch (SAXException e) {
@@ -322,37 +363,44 @@ public class RecursiveParserWrapper extends 
ParserDecorator {
                     metadata.add(WRITE_LIMIT_REACHED, "true");
                 } else {
                     if (catchEmbeddedExceptions) {
-                        String trace = ExceptionUtils.getStackTrace(e);
-                        metadata.set(EMBEDDED_EXCEPTION, trace);
+                        ParserUtils.recordParserFailure(this, e, metadata);
                     } else {
                         throw e;
                     }
                 }
             } catch (TikaException e) {
                 if (catchEmbeddedExceptions) {
-                    String trace = ExceptionUtils.getStackTrace(e);
-                    metadata.set(EMBEDDED_EXCEPTION, trace);
+                    ParserUtils.recordParserFailure(this, e, metadata);
                 } else {
                     throw e;
                 }
             } finally {
                 context.set(Parser.class, preContextParser);
-                long elapsedMillis = new Date().getTime() - started;
+                long elapsedMillis = System.currentTimeMillis() - started;
                 metadata.set(PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
             }
-            
+
             //Because of recursion, we need
-            //to re-test to make sure that we limit the 
+            //to re-test to make sure that we limit the
             //number of stored resources
-            if (maxEmbeddedResources > -1 && 
-                    metadatas.size() >= maxEmbeddedResources) {
-                hitMaxEmbeddedResources = true;
+            if 
(parserState.recursiveParserWrapperHandler.hasHitMaximumEmbeddedResources()) {
                 return;
             }
-            addContent(localHandler, metadata);
-            metadatas.add(deepCopy(metadata));
-        }        
+            
parserState.recursiveParserWrapperHandler.endEmbeddedDocument(localHandler, 
metadata);
+        }
     }
 
+    /**
+     * This tracks the state of the parse of a single document.
+     * In future versions, this will allow the RecursiveParserWrapper to be 
thread safe.
+     */
+    private class ParserState {
+        private int unknownCount = 0;
+        private final AbstractRecursiveParserWrapperHandler 
recursiveParserWrapperHandler;
+        private ParserState(AbstractRecursiveParserWrapperHandler handler) {
+            this.recursiveParserWrapperHandler = handler;
+        }
 
+
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java 
b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 02958c2..2598c99 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -16,11 +16,6 @@
  */
 package org.apache.tika.utils;
 
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -83,60 +78,13 @@ public class ParserUtils {
      *  {@link Exception} wasn't immediately thrown (eg when several different
      *  Parsers are used)
      */
-    public static void recordParserFailure(Parser parser, Exception failure, 
+    public static void recordParserFailure(Parser parser, Exception failure,
                                            Metadata metadata) {
         String trace = ExceptionUtils.getStackTrace(failure);
         metadata.add(EMBEDDED_EXCEPTION, trace);
         metadata.add(EMBEDDED_PARSER, getParserClassname(parser));
     }
 
-    /**
-     * Ensures that the Stream will be able to be re-read, by buffering to
-     *  a temporary file if required.
-     * Streams that are automatically OK include {@link TikaInputStream}s
-     *  created from Files or InputStreamFactories, and {@link 
RereadableInputStream}.
-     */
-    public static InputStream ensureStreamReReadable(InputStream stream, 
TemporaryResources tmp) throws IOException {
-        // If it's re-readable, we're done
-        if (stream instanceof RereadableInputStream) return stream;
 
-        // Make sure it's a TikaInputStream
-        TikaInputStream tstream = TikaInputStream.cast(stream);
-        if (tstream == null) {
-            tstream = TikaInputStream.get(stream, tmp);
-        }
 
-        // If it's factory based, it's ok
-        if (tstream.getInputStreamFactory() != null) return tstream;
-
-        // Ensure it's file based
-        tstream.getFile();
-        // Prepare for future re-reads
-        tstream.mark(-1);
-        return tstream;
-    }
-    /**
-     * Resets the given {@link TikaInputStream} (checked by 
-     *  {@link #ensureStreamReReadable(InputStream, TemporaryResources)})
-     * so that it can be re-read again.
-     */
-    public static InputStream streamResetForReRead(InputStream stream, 
TemporaryResources tmp) throws IOException {
-        // If re-readable, rewind to start
-        if (stream instanceof RereadableInputStream) {
-            ((RereadableInputStream)stream).rewind();
-            return stream;
-        }
-
-        // File or Factory based?
-        TikaInputStream tstream = (TikaInputStream)stream;
-        if (tstream.getInputStreamFactory() != null) {
-            // Just get a fresh one each time from the factory
-            return TikaInputStream.get(tstream.getInputStreamFactory(), tmp);
-        }
-
-        // File based, reset stream to beginning of File
-        tstream.reset();
-        tstream.mark(-1);
-        return tstream;
-    }
 }
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 4889b38..6acd190 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser;
 
 
 import static org.apache.tika.TikaTest.assertContains;
+import static org.apache.tika.TikaTest.debug;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
@@ -32,10 +33,13 @@ import org.apache.commons.io.IOUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.utils.CommonsDigester;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.ContentHandlerFactory;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.utils.ParserUtils;
 import org.junit.Test;
 import org.xml.sax.helpers.DefaultHandler;
 
@@ -46,7 +50,7 @@ public class RecursiveParserWrapperTest {
         List<Metadata> list = getMetadata(new Metadata(),
                 new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
         Metadata container = list.get(0);
-        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+        String content = 
container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
         //not much differentiates html from xml in this test file
         assertTrue(content.indexOf("<p class=\"header\" />") > -1);
     }
@@ -56,7 +60,7 @@ public class RecursiveParserWrapperTest {
         List<Metadata> list = getMetadata(new Metadata(),
                 new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));
         Metadata container = list.get(0);
-        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+        String content = 
container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
         //not much differentiates html from xml in this test file
         assertTrue(content.indexOf("<p class=\"header\"></p>") > -1);
     }
@@ -66,7 +70,7 @@ public class RecursiveParserWrapperTest {
         List<Metadata> list = getMetadata(new Metadata(),
                 new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
         Metadata container = list.get(0);
-        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+        String content = 
container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
         assertTrue(content.indexOf("<p ") < 0);
         assertTrue(content.indexOf("embed_0") > -1);
     }
@@ -76,7 +80,7 @@ public class RecursiveParserWrapperTest {
         List<Metadata> list = getMetadata(new Metadata(),
                 new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
         Metadata container = list.get(0);
-        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+        String content = 
container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
         assertNull(content);
     }
 
@@ -87,18 +91,19 @@ public class RecursiveParserWrapperTest {
         Metadata metadata = new Metadata();
 
         Parser wrapped = new AutoDetectParser();
-        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
-                new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped);
         InputStream stream = 
RecursiveParserWrapperTest.class.getResourceAsStream(
                 "/test-documents/test_recursive_embedded.docx");
-        wrapper.parse(stream, new DefaultHandler(), metadata, context);
-        List<Metadata> list = wrapper.getMetadata();
+        RecursiveParserWrapperHandler handler = new 
RecursiveParserWrapperHandler(
+                new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
+        wrapper.parse(stream, handler, metadata, context);
+        List<Metadata> list = handler.getMetadataList();
 
         assertEquals(5, list.size());
 
         int wlr = 0;
         for (Metadata m : list) {
-            String limitReached = 
m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED);
+            String limitReached = 
m.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED);
             if (limitReached != null && limitReached.equals("true")) {
                 wlr++;
             }
@@ -107,8 +112,12 @@ public class RecursiveParserWrapperTest {
 
     }
 
+    /**
+     * @deprecated this will be removed in 1.20 or 2.0
+     * @throws Exception
+     */
     @Test
-    public void testMaxEmbedded() throws Exception {
+    public void testMaxEmbeddedLegacy() throws Exception {
         int maxEmbedded = 4;
         int totalNoLimit = 12;//including outer container file
         ParseContext context = new ParseContext();
@@ -126,7 +135,7 @@ public class RecursiveParserWrapperTest {
         //test default
         assertEquals(totalNoLimit, list.size());
 
-        limitReached = 
list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+        limitReached = 
list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
         assertNull(limitReached);
 
 
@@ -142,9 +151,9 @@ public class RecursiveParserWrapperTest {
         list = wrapper.getMetadata();
 
         //add 1 for outer container file
-        assertEquals(maxEmbedded + 1, list.size());
+        assertEquals(maxEmbedded, list.size());
 
-        limitReached = 
list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+        limitReached = 
list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
         assertEquals("true", limitReached);
 
         wrapper.reset();
@@ -157,11 +166,68 @@ public class RecursiveParserWrapperTest {
 
         wrapper.setMaxEmbeddedResources(-2);
         wrapper.parse(stream, new DefaultHandler(), metadata, context);
+        assertEquals(totalNoLimit, wrapper.getMetadata().size());
+        limitReached = 
wrapper.getMetadata().get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
+        assertNull(limitReached);
+    }
+
+    @Test
+    public void testMaxEmbedded() throws Exception {
+        int maxEmbedded = 4;
+        int totalNoLimit = 12;//including outer container file
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+        String limitReached = null;
+
+        Parser wrapped = new AutoDetectParser();
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped);
+
+        InputStream stream = 
RecursiveParserWrapperTest.class.getResourceAsStream(
+                "/test-documents/test_recursive_embedded.docx");
+        RecursiveParserWrapperHandler handler = new 
RecursiveParserWrapperHandler(
+
+                new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,-1));
+        wrapper.parse(stream, handler, metadata, context);
+        List<Metadata> list = handler.getMetadataList();
+        //test default
         assertEquals(totalNoLimit, list.size());
-        limitReached = 
list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+
+        limitReached = 
list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
+        assertNull(limitReached);
+
+        stream.close();
+
+        //test setting value
+        metadata = new Metadata();
+        stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+                "/test-documents/test_recursive_embedded.docx");
+        handler = new RecursiveParserWrapperHandler(
+                new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), 
maxEmbedded);
+        wrapper.parse(stream, handler, metadata, context);
+        list = handler.getMetadataList();
+
+        //list size, including the outer container file, is capped at maxEmbedded
+        assertEquals(maxEmbedded, list.size());
+
+        limitReached = 
list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
+        assertEquals("true", limitReached);
+
+        stream.close();
+
+        //test setting value < 0
+        metadata = new Metadata();
+        stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+                "/test-documents/test_recursive_embedded.docx");
+        handler = new RecursiveParserWrapperHandler(
+                new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,-1), 
-2);
+        wrapper.parse(stream, handler, metadata, context);
+        list = handler.getMetadataList();
+        assertEquals(totalNoLimit, list.size());
+        limitReached = 
list.get(0).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED);
         assertNull(limitReached);
     }
 
+
     @Test
     public void testEmbeddedResourcePath() throws Exception {
 
@@ -183,12 +249,12 @@ public class RecursiveParserWrapperTest {
         List<Metadata> list = getMetadata(metadata,
                 new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
         Metadata container = list.get(0);
-        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+        String content = 
container.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
         assertTrue(content.indexOf("<p class=\"header\" />") > -1);
 
         Set<String> seen = new HashSet<String>();
         for (Metadata m : list) {
-            String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+            String path = 
m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
             if (path != null) {
                 seen.add(path);
             }
@@ -206,7 +272,7 @@ public class RecursiveParserWrapperTest {
         //is to catch the exception
         assertEquals(13, list.size());
         Metadata mockNPEMetadata = list.get(10);
-        assertContains("java.lang.NullPointerException", 
mockNPEMetadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+        assertContains("java.lang.NullPointerException", 
mockNPEMetadata.get(ParserUtils.EMBEDDED_EXCEPTION));
 
         metadata = new Metadata();
         metadata.set(Metadata.RESOURCE_NAME_KEY, 
"test_recursive_embedded_npe.docx");
@@ -230,8 +296,10 @@ public class RecursiveParserWrapperTest {
 
         ParseContext context = new ParseContext();
         Parser wrapped = new AutoDetectParser();
-        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
-                new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), 
true);
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, 
true);
+        RecursiveParserWrapperHandler handler = new 
RecursiveParserWrapperHandler(
+                new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+
         String path = "/test-documents/mock/embedded_then_npe.xml";
 
         InputStream stream = null;
@@ -239,7 +307,7 @@ public class RecursiveParserWrapperTest {
         try {
             stream = RecursiveParserWrapperTest.class.getResourceAsStream(
                     path);
-            wrapper.parse(stream, new DefaultHandler(), metadata, context);
+            wrapper.parse(stream, handler, metadata, context);
         } catch (TikaException e) {
             if (e.getCause().getClass().equals(NullPointerException.class)) {
                 npe = true;
@@ -249,16 +317,16 @@ public class RecursiveParserWrapperTest {
         }
         assertTrue("npe", npe);
 
-        List<Metadata> metadataList = wrapper.getMetadata();
+        List<Metadata> metadataList = handler.getMetadataList();
         assertEquals(2, metadataList.size());
         Metadata outerMetadata = metadataList.get(0);
         Metadata embeddedMetadata = metadataList.get(1);
-        assertContains("main_content", 
outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertEquals("embedded_then_npe.xml", 
outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
+        assertContains("main_content", 
outerMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("embedded_then_npe.xml", 
outerMetadata.get(Metadata.RESOURCE_NAME_KEY));
         assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));
 
-        assertContains("some_embedded_content", 
embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-        assertEquals("embed1.xml", 
embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
+        assertContains("some_embedded_content", 
embeddedMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+        assertEquals("embed1.xml", 
embeddedMetadata.get(Metadata.RESOURCE_NAME_KEY));
         assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
     }
 
@@ -268,7 +336,7 @@ public class RecursiveParserWrapperTest {
         metadata.set(Metadata.RESOURCE_NAME_KEY, 
"test_recursive_embedded.docx");
         List<Metadata> list = getMetadata(metadata,
                 new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
-                true, new CommonsDigester(100000, 
CommonsDigester.DigestAlgorithm.MD5));
+                true, new CommonsDigester(100000, "md5"));
         int i = 0;
         Metadata m0 = list.get(0);
         Metadata m6 = list.get(6);
@@ -286,8 +354,7 @@ public class RecursiveParserWrapperTest {
         if (digester != null) {
             wrapped = new DigestingParser(wrapped, digester);
         }
-        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
-                contentHandlerFactory, catchEmbeddedExceptions);
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, 
catchEmbeddedExceptions);
         String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
         if (path == null) {
             path = "/test-documents/test_recursive_embedded.docx";
@@ -295,13 +362,14 @@ public class RecursiveParserWrapperTest {
             path = "/test-documents/" + path;
         }
         InputStream stream = null;
+        RecursiveParserWrapperHandler handler = new 
RecursiveParserWrapperHandler(contentHandlerFactory);
         try {
             stream = 
TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(path).toURI());
-            wrapper.parse(stream, new DefaultHandler(), metadata, context);
+            wrapper.parse(stream, handler, metadata, context);
         } finally {
             IOUtils.closeQuietly(stream);
         }
-        return wrapper.getMetadata();
+        return handler.getMetadataList();
 
     }
 

-- 
To stop receiving notification emails like this one, please contact
talli...@apache.org.

Reply via email to