Author: tallison
Date: Sun Jun 28 01:57:30 2015
New Revision: 1687981

URL: http://svn.apache.org/r1687981
Log:
TIKA-1663 add a DigestingParser

Added:
    tika/trunk/tika-app/src/main/java/org/apache/tika/batch/
    
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
    tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/
    
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
    tika/trunk/tika-app/src/main/resources/tika-app-batch-config.xml
    
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/IParserFactoryBuilder.java
    
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/ParserFactoryBuilder.java
    
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
    
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
    tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties
    
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/AutoDetectParserFactory.java
    tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParserFactory.java
    
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
    
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
    
tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
    
tika/trunk/tika-batch/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
    
tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
    tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
    tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaDetectors.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaMimeTypes.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaUtils.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaVersion.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaWelcome.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java
    
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
    tika/trunk/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/DetectorResourceTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/StackTraceOffTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaDetectorsTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaMimeTypesTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaVersionTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaWelcomeTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TranslateResourceTest.java
    
tika/trunk/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Jun 28 01:57:30 2015
@@ -1,5 +1,9 @@
 Release 1.10 - Current Development
 
+  * Added DigestingParser to calculate digest hashes 
+    and record them in metadata. Integrated with
+    tika-app and tika-server (TIKA-1663).
+
   * Fixed ZipContainerDetector to detect all IPA files
     (TIKA-1659).
 

Added: 
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java?rev=1687981&view=auto
==============================================================================
--- 
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
 (added)
+++ 
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
 Sun Jun 28 01:57:30 2015
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.batch;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.Parser;
+
+public class DigestingAutoDetectParserFactory extends ParserFactory {
+
+    private DigestingParser.Digester digester = null;
+
+
+    @Override
+    public Parser getParser(TikaConfig config) {
+        Parser p = new AutoDetectParser(config);
+        if (digester == null) {
+            return p;
+        }
+        DigestingParser d = new DigestingParser(p, digester);
+        return d;
+    }
+
+    public void setDigester(DigestingParser.Digester digester) {
+        this.digester = digester;
+    }
+}

Added: 
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java?rev=1687981&view=auto
==============================================================================
--- 
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
 (added)
+++ 
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
 Sun Jun 28 01:57:30 2015
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.batch.builders;
+
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.tika.batch.DigestingAutoDetectParserFactory;
+import org.apache.tika.batch.ParserFactory;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.utils.CommonsDigester;
+import org.apache.tika.util.ClassLoaderUtil;
+import org.apache.tika.util.XMLDOMUtil;
+import org.w3c.dom.Node;
+
+public class AppParserFactoryBuilder implements IParserFactoryBuilder {
+
+    @Override
+    public ParserFactory build(Node node, Map<String, String> runtimeAttrs) {
+        Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, 
runtimeAttrs);
+        String className = localAttrs.get("class");
+        ParserFactory pf = ClassLoaderUtil.buildClass(ParserFactory.class, 
className);
+
+        if (localAttrs.containsKey("parseRecursively")) {
+            String bString = 
localAttrs.get("parseRecursively").toLowerCase(Locale.ENGLISH);
+            if (bString.equals("true")) {
+                pf.setParseRecursively(true);
+            } else if (bString.equals("false")) {
+                pf.setParseRecursively(false);
+            } else {
+                throw new RuntimeException("parseRecursively must have value 
of \"true\" or \"false\": "+
+                        bString);
+            }
+        }
+        if (pf instanceof DigestingAutoDetectParserFactory) {
+            DigestingParser.Digester d = buildDigester(localAttrs);
+            ((DigestingAutoDetectParserFactory)pf).setDigester(d);
+        }
+        return pf;
+    }
+
+    private DigestingParser.Digester buildDigester(Map<String, String> 
localAttrs) {
+        String digestString = localAttrs.get("digest");
+        CommonsDigester.DigestAlgorithm[] algos = 
CommonsDigester.parse(digestString);
+
+        String readLimitString = localAttrs.get("digestMarkLimit");
+        if (readLimitString == null) {
+            throw new IllegalArgumentException("Must specify 
\"digestMarkLimit\" for "+
+            "the DigestingAutoDetectParserFactory");
+        }
+        int readLimit = -1;
+
+        try {
+            readLimit = Integer.parseInt(readLimitString);
+        } catch (NumberFormatException e) {
+            throw new IllegalArgumentException("Parameter \"digestMarkLimit\" 
must be a parseable int: "+
+            readLimitString);
+        }
+        return new CommonsDigester(readLimit, algos);
+    }
+}

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Sun Jun 
28 01:57:30 2015
@@ -88,6 +88,7 @@ import org.apache.tika.mime.MimeTypeExce
 import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.DigestingParser;
 import org.apache.tika.parser.NetworkParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -95,6 +96,7 @@ import org.apache.tika.parser.ParserDeco
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.utils.CommonsDigester;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerFactory;
@@ -108,6 +110,9 @@ import org.xml.sax.helpers.DefaultHandle
  * Simple command line interface for Apache Tika.
  */
 public class TikaCLI {
+
+    private final int MAX_MARK = 20*1024*1024;//20MB
+
     private File extractDir = new File(".");
 
     private static final Log logger = LogFactory.getLog(TikaCLI.class);
@@ -334,6 +339,8 @@ public class TikaCLI {
      */
     private String password = System.getenv("TIKA_PASSWORD");
 
+    private DigestingParser.Digester digester = null;
+
     private boolean pipeMode = true;
 
     private boolean serverMode = false;
@@ -400,6 +407,11 @@ public class TikaCLI {
             fork = true;
         } else if (arg.startsWith("--config=")) {
             configure(arg.substring("--config=".length()));
+        } else if (arg.startsWith("--digest=")) {
+            CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse(
+                    arg.substring("--digest=".length()));
+            digester = new CommonsDigester(MAX_MARK,algos);
+            parser = new DigestingParser(parser, digester);
         } else if (arg.startsWith("-e")) {
             encoding = arg.substring("-e".length());
         } else if (arg.startsWith("--encoding=")) {
@@ -545,6 +557,8 @@ public class TikaCLI {
         out.println("                           with -x, -h, -t or -m; default 
is -x)");
         out.println("    -l  or --language      Output only language");
         out.println("    -d  or --detect        Detect document type");
+        out.println("           --digest=X      Include digest X (md2, md5, 
sha1,");
+        out.println("                               sha256, sha384, sha512)");
         out.println("    -eX or --encoding=X    Use output encoding X");
         out.println("    -pX or --password=X    Use document password X");
         out.println("    -z  or --extract       Extract all attachements into 
current directory");
@@ -662,6 +676,9 @@ public class TikaCLI {
         this.configFilePath = configFilePath;
         TikaConfig config = new TikaConfig(new File(configFilePath));
         parser = new AutoDetectParser(config);
+        if (digester != null) {
+            parser = new DigestingParser(parser, digester);
+        }
         detector = config.getDetector();
         context.set(Parser.class, parser);
     }

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java 
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Sun Jun 
28 01:57:30 2015
@@ -47,6 +47,10 @@ import java.awt.event.ActionEvent;
 import java.awt.event.ActionListener;
 import java.awt.event.KeyEvent;
 import java.awt.event.WindowEvent;
+import java.awt.event.ActionEvent;
+import java.awt.event.ActionListener;
+import java.awt.event.KeyEvent;
+import java.awt.event.WindowEvent;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
@@ -71,10 +75,12 @@ import org.apache.tika.metadata.serializ
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DigestingParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.utils.CommonsDigester;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerDecorator;
@@ -92,6 +98,9 @@ import org.xml.sax.helpers.AttributesImp
 public class TikaGUI extends JFrame
         implements ActionListener, HyperlinkListener {
 
+    //maximum length to allow for mark for reparse to get JSON
+    private static final int MAX_MARK = 20*1024*1024;//20MB
+
     /**
      * Serial version UID.
      */
@@ -115,13 +124,16 @@ public class TikaGUI extends JFrame
         final TikaConfig finalConfig = config;
         SwingUtilities.invokeLater(new Runnable() {
             public void run() {
-                new TikaGUI(new 
AutoDetectParser(finalConfig)).setVisible(true);
+                new TikaGUI(new DigestingParser(
+                        new AutoDetectParser(finalConfig),
+                        new CommonsDigester(MAX_MARK,
+                                CommonsDigester.DigestAlgorithm.MD5,
+                                CommonsDigester.DigestAlgorithm.SHA256)
+                        )).setVisible(true);
             }
         });
     }
 
-    //maximum length to allow for mark for reparse to get JSON
-    private final int MAX_MARK = 20*1024*1024;//20MB
     /**
      * Parsing context.
      */
@@ -334,11 +346,22 @@ public class TikaGUI extends JFrame
                 getXmlContentHandler(xmlBuffer));
 
         context.set(DocumentSelector.class, new ImageDocumentSelector());
+
+        input = TikaInputStream.get(new ProgressMonitorInputStream(
+                this, "Parsing stream", input));
+
         if (input.markSupported()) {
-            input.mark(MAX_MARK);
+            int mark = -1;
+            if (input instanceof TikaInputStream) {
+                if (((TikaInputStream)input).hasFile()) {
+                    mark = (int)((TikaInputStream)input).getLength();
+                }
+            }
+            if (mark == -1) {
+                mark = MAX_MARK;
+            }
+            input.mark(mark);
         }
-        input = new ProgressMonitorInputStream(
-                this, "Parsing stream", input);
         parser.parse(input, handler, md, context);
 
         String[] names = md.names();

Added: tika/trunk/tika-app/src/main/resources/tika-app-batch-config.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/resources/tika-app-batch-config.xml?rev=1687981&view=auto
==============================================================================
--- tika/trunk/tika-app/src/main/resources/tika-app-batch-config.xml (added)
+++ tika/trunk/tika-app/src/main/resources/tika-app-batch-config.xml Sun Jun 28 
01:57:30 2015
@@ -0,0 +1,136 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<!-- NOTE: tika-batch is still an experimental feature.
+    The configuration file will likely change and be backward incompatible
+    with new versions of Tika.  Please stay tuned.
+    -->
+
+<tika-batch-config
+        maxAliveTimeSeconds="-1"
+        pauseOnEarlyTerminationMillis="10000"
+        timeoutThresholdMillis="300000"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="default"> <!-- numConsumers = number of file consumers, 
"default" = number of processors -1 -->
+
+    <!-- options to allow on the commandline -->
+    <commandline>
+        <option opt="c" longOpt="tika-config" hasArg="true"
+                description="TikaConfig file"/>
+        <option opt="bc" longOpt="batch-config" hasArg="true"
+                description="xml batch config file"/>
+        <!-- We needed sorted for testing.  We added random for performance.
+             Where crawling a directory is slow, it might be beneficial to
+             go randomly so that the parsers are triggered earlier.  The
+             default is operating system's choice ("os") which means whatever 
order
+             the os returns files in .listFiles(). -->
+        <option opt="crawlOrder" hasArg="true"
+                description="how does the crawler sort the directories and 
files:
+                                (random|sorted|os)"/>
+        <option opt="numConsumers" hasArg="true"
+                description="number of fileConsumers threads"/>
+        <option opt="maxFileSizeBytes" hasArg="true"
+                description="maximum file size to process; do not process 
files larger than this"/>
+        <option opt="maxQueueSize" hasArg="true"
+                description="maximum queue size for FileResources"/>
+        <option opt="fileList" hasArg="true"
+                description="file that contains a list of files (relative to 
inputDir) to process"/>
+        <option opt="fileListEncoding" hasArg="true"
+                description="encoding for fileList"/>
+        <option opt="inputDir" hasArg="true"
+                description="root directory for the files to be processed"/>
+        <option opt="startDir" hasArg="true"
+                description="directory (under inputDir) at which to start 
crawling"/>
+        <option opt="outputDir" hasArg="true"
+                description="output directory for output"/> <!-- do we want to 
make this mandatory -->
+        <option opt="recursiveParserWrapper"
+                description="use the RecursiveParserWrapper or not (default = 
false)"/>
+        <option opt="handleExisting" hasArg="true"
+                description="if an output file already exists, do you want to: 
overwrite, rename or skip"/>
+        <option opt="basicHandlerType" hasArg="true"
+                description="what type of content handler: xml, text, html, 
body"/>
+        <option opt="outputSuffix" hasArg="true"
+                description="suffix to add to the end of the output file 
name"/>
+        <option opt="timeoutThresholdMillis" hasArg="true"
+                description="how long to wait before determining that a 
consumer is stale"/>
+        <option opt="includeFilePat" hasArg="true"
+                description="regex that specifies which files to process"/>
+        <option opt="excludeFilePat" hasArg="true"
+                description="regex that specifies which files to avoid 
processing"/>
+        <option opt="reporterSleepMillis" hasArg="true"
+                description="milliseconds between reports by the reporter"/>
+        <option opt="digest" hasArg="true"
+                description="which digest(s) to use, e.g. 'md5,sha512'"/>
+        <option opt="digestMarkLimit" hasArg="true"
+                description="max bytes to read for digest"/>
+    </commandline>
+
+
+    <!-- can specify inputDir="input", but the default config should not 
include this -->
+    <!-- can also specify startDir="input/someDir" to specify which child 
directory
+         to start processing -->
+       <crawler 
builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+        crawlOrder="random"
+               maxFilesToAdd="-1" 
+               maxFilesToConsider="-1" 
+               includeFilePat=""
+               excludeFilePat=""
+               maxFileSizeBytes="-1"
+        />
+<!--
+    This is an example of a crawler that reads a list of files to be processed 
from a
+    file.  This assumes that the files in the list are relative to inputDir.
+    <crawler class="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+             fileList="files.txt"
+             fileListEncoding="UTF-8"
+             maxFilesToAdd="-1"
+             maxFilesToConsider="-1"
+             includeFilePat="(?i).pdf$"
+             excludeFilePat="(?i).msg$"
+             maxFileSizeBytes="-1"
+             inputDir="input"
+    />
+-->
+    <!--
+        To wrap parser in RecursiveParserWrapper (tika-app's -J or 
tika-server's /rmeta),
+        add attribute recursiveParserWrapper="true" to consumers element.
+
+        To wrap parser with DigestingParser add attributes e.g.:
+        digest="md5,sha256" digestMarkLimit="10000000"
+        -->
+    <consumers 
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+               recursiveParserWrapper="false" 
consumersManagerMaxMillis="60000">
+        <parser 
builderClass="org.apache.tika.batch.builders.AppParserFactoryBuilder"
+                class="org.apache.tika.batch.DigestingAutoDetectParserFactory"
+                parseRecursively="true"
+                digest="md5" digestMarkLimit="1000000"/>
+        <contenthandler 
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+                        basicHandlerType="xml" writeLimit="-1"/>
+        <!-- overwritePolicy: "skip" a file if output file exists, "rename" an 
output file, "overwrite" -->
+        <!-- can include e.g. outputDir="output", but we don't want to include 
this in the default! -->
+        <outputstream class="FSOutputStreamFactory" encoding="UTF-8" 
outputSuffix="xml"/>
+    </consumers>
+
+    <!-- reporter and interrupter are optional -->
+    <reporter 
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" 
reporterSleepMillis="1000"
+              reporterStaleThresholdMillis="60000"/>
+    <interrupter 
builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file

Modified: 
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
 (original)
+++ 
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
 Sun Jun 28 01:57:30 2015
@@ -18,6 +18,7 @@
 package org.apache.tika.cli;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
 import java.io.ByteArrayOutputStream;
@@ -126,6 +127,49 @@ public class TikaCLIBatchIntegrationTest
         assertTrue(sysOutString.contains("MY_CUSTOM_LOG_CONFIG"));
     }
 
+    @Test
+    public void testDigester() throws Exception {
+        Reader reader = null;
+/*        try {
+            String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+                    "-o", escape(tempDir.getAbsolutePath()),
+                    "-numConsumers", "10",
+                    "-J", //recursive Json
+                    "-t" //plain text in content
+            };
+            TikaCLI.main(params);
+            reader = new InputStreamReader(
+                    new FileInputStream(new File(tempDir, 
"test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+            List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+            assertEquals(12, metadataList.size());
+            assertEquals("59f626e09a8c16ab6dbc2800c685f772", 
metadataList.get(0).get("X-TIKA:digest:MD5"));
+            assertEquals("22e6e91f408d018417cd452d6de3dede", 
metadataList.get(5).get("X-TIKA:digest:MD5"));
+        } finally {
+            IOUtils.closeQuietly(reader);
+        }
+*/
+        reader = null;
+        try {
+            String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+                    "-o", escape(tempDir.getAbsolutePath()),
+                    "-numConsumers", "10",
+                    "-J", //recursive Json
+                    "-t", //plain text in content
+                    "-digest", "sha512"
+            };
+            TikaCLI.main(params);
+            reader = new InputStreamReader(
+                    new FileInputStream(new File(tempDir, 
"test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+            List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+            assertEquals(12, metadataList.size());
+            assertNotNull(metadataList.get(0).get("X-TIKA:digest:SHA512"));
+            
assertTrue(metadataList.get(0).get("X-TIKA:digest:SHA512").startsWith("ee46d973ee1852c01858"));
+        } finally {
+            IOUtils.closeQuietly(reader);
+        }
+
+    }
+
     public static String escape(String path) {
         if (path.indexOf(' ') > -1) {
             return '"' + path + '"';

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java 
(original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Sun 
Jun 28 01:57:30 2015
@@ -100,6 +100,12 @@ public class TikaCLITest {
         String[] params = {"-x", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
         assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("?xml 
version=\"1.0\" encoding=\"UTF-8\"?"));
+
+        params = new String[]{"-x", "--digest=SHA256", resourcePrefix + 
"alice.cli.test"};
+        TikaCLI.main(params);
+        assertTrue(outContent.toString(IOUtils.UTF_8.name())
+                .contains("<meta name=\"X-TIKA:digest:SHA256\" 
content=\"e90779adbac09c4ee"));
+
     }
 
     /**
@@ -114,6 +120,11 @@ public class TikaCLITest {
         assertTrue(outContent.toString("UTF-8").contains("html 
xmlns=\"http://www.w3.org/1999/xhtml\""));
         assertTrue("Expanded <title></title> element should be present",
                 
outContent.toString(IOUtils.UTF_8.name()).contains("<title></title>"));
+
+        params = new String[]{"-h", "--digest=SHA384", resourcePrefix + 
"alice.cli.test"};
+        TikaCLI.main(params);
+        assertTrue(outContent.toString("UTF-8")
+                .contains("<meta name=\"X-TIKA:digest:SHA384\" 
content=\"c69ea023f5da95a026"));
     }
 
     /**
@@ -137,6 +148,12 @@ public class TikaCLITest {
         String[] params = {"-m", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
         
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
+
+        params = new String[]{"-m", "--digest=SHA512", resourcePrefix + 
"alice.cli.test"};
+        TikaCLI.main(params);
+        
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
+        assertTrue(outContent.toString(IOUtils.UTF_8.name())
+                .contains("X-TIKA:digest:SHA512: 
dd459d99bc19ff78fd31fbae46e0"));
     }
 
     /**
@@ -146,7 +163,7 @@ public class TikaCLITest {
      */
     @Test
     public void testJsonMetadataOutput() throws Exception {
-        String[] params = {"--json", resourcePrefix + 
"testJsonMultipleInts.html"};
+        String[] params = {"--json", "--digest=MD2", resourcePrefix + 
"testJsonMultipleInts.html"};
         TikaCLI.main(params);
         String json = outContent.toString(IOUtils.UTF_8.name());
         //TIKA-1310
@@ -158,6 +175,7 @@ public class TikaCLITest {
         int title = json.indexOf("\"title\"");
         assertTrue(enc > -1 && fb > -1 && enc < fb);
         assertTrue (fb > -1 && title > -1 && fb < title);
+        
assertTrue(json.contains("\"X-TIKA:digest:MD2\":\"470481522c33aa7f6558dfc5cc0c8135\""));
     }
 
     /**
@@ -378,4 +396,14 @@ public class TikaCLITest {
         assertTrue(content.contains("\\n\\nembed_4\\n"));
         assertTrue(content.contains("\\n\\nembed_0"));
     }
+
+    @Test
+    public void testDigestInJson() throws Exception {
+        String[] params = new String[]{"-J", "-r", "-t", "--digest=MD5", 
resourcePrefix+"test_recursive_embedded.docx"};
+        TikaCLI.main(params);
+        String content = outContent.toString(IOUtils.UTF_8.name());
+        assertTrue(content.contains("\"X-TIKA:digest:MD5\": 
\"59f626e09a8c16ab6dbc2800c685f772\","));
+        assertTrue(content.contains("\"X-TIKA:digest:MD5\": 
\"f9627095ef86c482e61d99f0cc1cf87d\""));
+    }
+
 }

Modified: 
tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties 
(original)
+++ tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties 
Sun Jun 28 01:57:30 2015
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 #info,debug, error,fatal ...
-log4j.rootLogger=info,stdout
+log4j.rootLogger=trace,stdout
 
 #console
 log4j.appender.stdout=org.apache.log4j.ConsoleAppender

Modified: 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/AutoDetectParserFactory.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/AutoDetectParserFactory.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/AutoDetectParserFactory.java
 (original)
+++ 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/AutoDetectParserFactory.java
 Sun Jun 28 01:57:30 2015
@@ -24,11 +24,12 @@ import org.apache.tika.parser.Parser;
 /**
  * Simple class for AutoDetectParser
  */
-public class AutoDetectParserFactory implements ParserFactory {
+public class AutoDetectParserFactory extends ParserFactory {
 
   @Override
   public Parser getParser(TikaConfig config) {
     return new AutoDetectParser(config);
   }
-  
+
+
 }

Modified: 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParserFactory.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParserFactory.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParserFactory.java 
(original)
+++ 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParserFactory.java 
Sun Jun 28 01:57:30 2015
@@ -20,8 +20,18 @@ package org.apache.tika.batch;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.parser.Parser;
 
-public interface ParserFactory {
-  
-  public Parser getParser(TikaConfig config);
+public abstract class ParserFactory {
+
+    private boolean parseRecursively = true;
+
+    public abstract Parser getParser(TikaConfig config);
+
+    public boolean getParseRecursively() {
+        return parseRecursively;
+    }
+
+    public void setParseRecursively(boolean parseRecursively) {
+        this.parseRecursively = parseRecursively;
+    }
 
 }

Added: 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/IParserFactoryBuilder.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/IParserFactoryBuilder.java?rev=1687981&view=auto
==============================================================================
--- 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/IParserFactoryBuilder.java
 (added)
+++ 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/IParserFactoryBuilder.java
 Sun Jun 28 01:57:30 2015
@@ -0,0 +1,12 @@
+package org.apache.tika.batch.builders;
+
+
+import java.util.Map;
+
+import org.apache.tika.batch.ParserFactory;
+import org.w3c.dom.Node;
+
+public interface IParserFactoryBuilder {
+
+    public ParserFactory build(Node node, Map<String, String> runtimeAttrs);
+}

Added: 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/ParserFactoryBuilder.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/ParserFactoryBuilder.java?rev=1687981&view=auto
==============================================================================
--- 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/ParserFactoryBuilder.java
 (added)
+++ 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/ParserFactoryBuilder.java
 Sun Jun 28 01:57:30 2015
@@ -0,0 +1,33 @@
+package org.apache.tika.batch.builders;
+
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.tika.batch.ParserFactory;
+import org.apache.tika.util.ClassLoaderUtil;
+import org.apache.tika.util.XMLDOMUtil;
+import org.w3c.dom.Node;
+
+public class ParserFactoryBuilder implements IParserFactoryBuilder {
+
+
+    @Override
+    public ParserFactory build(Node node, Map<String, String> runtimeAttrs) {
+        Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, 
runtimeAttrs);
+        String className = localAttrs.get("class");
+        ParserFactory pf = ClassLoaderUtil.buildClass(ParserFactory.class, 
className);
+
+        if (localAttrs.containsKey("parseRecursively")) {
+            String bString = 
localAttrs.get("parseRecursively").toLowerCase(Locale.ENGLISH);
+            if (bString.equals("true")) {
+                pf.setParseRecursively(true);
+            } else if (bString.equals("false")) {
+                pf.setParseRecursively(false);
+            } else {
+                throw new RuntimeException("parseRecursively must have value 
of \"true\" or \"false\": "+
+                bString);
+            }
+        }
+        return pf;
+    }
+}

Modified: 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
 (original)
+++ 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
 Sun Jun 28 01:57:30 2015
@@ -19,6 +19,7 @@ package org.apache.tika.batch.fs;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.URL;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.concurrent.ExecutorService;
@@ -74,10 +75,17 @@ public class FSBatchProcessCLI {
             is = TikaInputStream.get(batchConfigFile);
         } else {
             if (logDefault) {
-                logger.info("No config file set via -bc, relying on 
default-tika-batch-config.xml");
+                logger.info("No config file set via -bc, relying on 
tika-app-batch-config.xml or default-tika-batch-config.xml");
+            }
+            //test to see if there's a tika-app-batch-config.xml on the path
+            URL config = 
FSBatchProcessCLI.class.getResource("/tika-app-batch-config.xml");
+            if (config != null) {
+                is = TikaInputStream.get(
+                        
FSBatchProcessCLI.class.getResourceAsStream("/tika-app-batch-config.xml"));
+            } else {
+                is = TikaInputStream.get(
+                        
FSBatchProcessCLI.class.getResourceAsStream("default-tika-batch-config.xml"));
             }
-            is = TikaInputStream.get(
-                    
FSBatchProcessCLI.class.getResourceAsStream("default-tika-batch-config.xml"));
         }
         return is;
     }

Modified: 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
 (original)
+++ 
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
 Sun Jun 28 01:57:30 2015
@@ -31,6 +31,7 @@ import org.apache.tika.batch.ParserFacto
 import org.apache.tika.batch.builders.AbstractConsumersBuilder;
 import org.apache.tika.batch.builders.BatchProcessBuilder;
 import org.apache.tika.batch.builders.IContentHandlerFactoryBuilder;
+import org.apache.tika.batch.builders.IParserFactoryBuilder;
 import org.apache.tika.batch.fs.BasicTikaFSConsumer;
 import org.apache.tika.batch.fs.FSConsumersManager;
 import org.apache.tika.batch.fs.FSOutputStreamFactory;
@@ -156,10 +157,10 @@ public class BasicTikaFSConsumersBuilder
     }
 
     private ParserFactory getParserFactory(Node node, Map<String, String> 
runtimeAttributes) {
-        //TODO: add ability to set TikaConfig file path
         Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, 
runtimeAttributes);
-        String className = localAttrs.get("class");
-        return ClassLoaderUtil.buildClass(ParserFactory.class, className);
+        String className = localAttrs.get("builderClass");
+        IParserFactoryBuilder builder = 
ClassLoaderUtil.buildClass(IParserFactoryBuilder.class, className);
+        return builder.build(node, runtimeAttributes);
     }
 
     private OutputStreamFactory getOutputStreamFactory(Node node, Map<String, 
String> runtimeAttributes) {

Modified: 
tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
 (original)
+++ 
tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
 Sun Jun 28 01:57:30 2015
@@ -103,15 +103,20 @@
              excludeFilePat="(?i).msg$"
              maxFileSizeBytes="-1"
              inputDir="input"
-    />
--->
-    <consumers 
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
-               recursiveParserWrapper="false" 
consumersManagerMaxMillis="60000">
-        <parser class="org.apache.tika.batch.AutoDetectParserFactory" 
parseRecursively="true"/>
-        <contenthandler 
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+    />
+-->
+    <!--
+        To wrap parser in RecursiveParserWrapper (tika-app's -J or 
tika-server's /rmeta),
+        add attribute recursiveParserWrapper="true" to consumers element.
+        -->
+    <consumers 
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+               recursiveParserWrapper="false" 
consumersManagerMaxMillis="60000">
+        <parser 
builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+                class="org.apache.tika.batch.AutoDetectParserFactory"
+                parseRecursively="true"/>
+        <contenthandler 
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
                         basicHandlerType="xml" writeLimit="-1"/>
-        <!-- overwritePolicy: "skip" a file if output file exists, "rename" a 
output file, "overwrite" -->
-        <!-- can include e.g. outputDir="output", but we don't want to include 
this in the default! -->
+        <!-- overwritePolicy: "skip" a file if output file exists, "rename" a 
output file, "overwrite" -->        <!-- can include e.g. outputDir="output", 
but we don't want to include this in the default! -->
         <outputstream class="FSOutputStreamFactory" encoding="UTF-8" 
outputSuffix="xml"/>
     </consumers>
 

Modified: 
tika/trunk/tika-batch/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-batch/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
 (original)
+++ 
tika/trunk/tika-batch/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
 Sun Jun 28 01:57:30 2015
@@ -21,9 +21,11 @@ import org.apache.tika.batch.ParserFacto
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.parser.Parser;
 
-public class MockParserFactory implements ParserFactory {
+public class MockParserFactory extends ParserFactory {
+
     @Override
     public Parser getParser(TikaConfig config) {
         return new MockParser();
     }
+
 }

Modified: 
tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
 (original)
+++ 
tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
 Sun Jun 28 01:57:30 2015
@@ -96,11 +96,12 @@
 
        <consumers 
builderClass="org.apache.tika.batch.mock.MockConsumersBuilder"
                recursiveParserWrapper="false" consumersManagerMaxMillis="1000">
-               <parser class="org.apache.tika.parser.mock.MockParserFactory" 
parseRecursively="true"/>
+        <parser 
builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+                class="org.apache.tika.parser.mock.MockParserFactory"
+                parseRecursively="true"/>
                <contenthandler 
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
                         basicHandlerType="xml" writeLimit="-1"/>
 
-
                <outputstream class="FSOutputStreamFactory"
                 encoding="UTF-8" outputSuffix="xml"/>
        </consumers>

Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml 
(original)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml Sun 
Jun 28 01:57:30 2015
@@ -89,11 +89,13 @@
 
        <consumers 
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
                recursiveParserWrapper="false">
-               <parser class="org.apache.tika.parser.mock.MockParserFactory" 
parseRecursively="true"/>
+        <parser 
builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+            class="org.apache.tika.batch.AutoDetectParserFactory"
+            parseRecursively="true"/>
+        
                <contenthandler 
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
                         basicHandlerType="xml" writeLimit="-1"/>
 
-
                <outputstream class="FSOutputStreamFactory"
                 encoding="UTF-8" outputSuffix="xml"/>
        </consumers>

Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml 
(original)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml Sun Jun 
28 01:57:30 2015
@@ -95,11 +95,12 @@
 
        <consumers 
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
                recursiveParserWrapper="false" 
consumersManagerMaxMillis="120000">
-               <parser class="org.apache.tika.parser.mock.MockParserFactory" 
parseRecursively="true"/>
+        <parser 
builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+                class="org.apache.tika.parser.mock.MockParserFactory"
+                parseRecursively="true"/>
                <contenthandler 
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
                         basicHandlerType="xml" writeLimit="-1"/>
 
-
                <outputstream class="FSOutputStreamFactory"
                 encoding="UTF-8" outputSuffix="xml"/>
        </consumers>

Added: 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java?rev=1687981&view=auto
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java 
(added)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java 
Sun Jun 28 01:57:30 2015
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class DigestingParser extends ParserDecorator {
+
+    /**
+     * Interface for optional digester, if specified during construction.
+     * See org.apache.tika.parser.utils.CommonsDigester in tika-parsers for an
+     * implementation.
+     */
+    public interface Digester {
+        /**
+         * Digests an InputStream and sets the appropriate value(s) in the 
metadata.
+         * The Digester is also responsible for marking and resetting the 
stream.
+         * <p>
+         * The given stream is guaranteed to support the
+         * {@link InputStream#markSupported() mark feature} and the digester
+         * is expected to {@link InputStream#mark(int) mark} the stream before
+         * reading any bytes from it, and to {@link InputStream#reset() reset}
+         * the stream before returning. The stream must not be closed by the
+         * digester.
+         *
+         * @param is InputStream to digest
+         * @param m Metadata to set the values for
+         * @param parseContext ParseContext
+         * @throws IOException
+         */
+        void digest(InputStream is, Metadata m, ParseContext parseContext) 
throws IOException;
+
+
+    };
+
+    private final Digester digester;
+    /**
+     * Creates a decorator for the given parser.
+     *
+     * @param parser the parser instance to be decorated
+     */
+    public DigestingParser(Parser parser, Digester digester) {
+        super(parser);
+        this.digester = digester;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context) throws IOException, SAXException, TikaException 
{
+        if (digester != null) {
+            digester.digest(stream, metadata, context);
+        }
+        super.parse(stream, handler, metadata, context);
+    }
+}

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java?rev=1687981&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
 Sun Jun 28 01:57:30 2015
@@ -0,0 +1,299 @@
+package org.apache.tika.parser.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
+ * that relies on commons.codec.digest.DigestUtils to calculate digest hashes.
+ * <p>
+ * This digester tries to use the regular mark/reset protocol on the 
InputStream.
+ * However, this wraps an internal BoundedInputStream, and if the InputStream
+ * is not fully read, then this will reset the stream and
+ * spool the InputStream to disk (via TikaInputStream) and then digest the 
file.
+ * <p>
+ * If a TikaInputStream is passed in and it has an underlying file that is 
longer
+ * than the {@link #markLimit}, then this digester digests the file directly.
+ *
+ */
+public class CommonsDigester implements DigestingParser.Digester {
+
+    public enum DigestAlgorithm {
+        //those currently available in commons.digest
+        MD2,
+        MD5,
+        SHA1,
+        SHA256,
+        SHA384,
+        SHA512;
+
+        String getMetadataKey() {
+            return TikaCoreProperties.TIKA_META_PREFIX+
+                    
"digest"+Metadata.NAMESPACE_PREFIX_DELIMITER+this.toString();
+        }
+    }
+
+    private final List<DigestAlgorithm> algorithms = new 
ArrayList<DigestAlgorithm>();
+    private final int markLimit;
+
+    public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
+        Collections.addAll(this.algorithms, algorithms);
+        if (markLimit < 0) {
+            throw new IllegalArgumentException("markLimit must be >= 0");
+        }
+        this.markLimit = markLimit;
+    }
+
+    @Override
+    public void digest(InputStream is, Metadata m, ParseContext parseContext) 
throws IOException {
+        InputStream tis = TikaInputStream.get(is);
+        long sz = -1;
+        if (((TikaInputStream)tis).hasFile()) {
+            sz = ((TikaInputStream)tis).getLength();
+        }
+        //if the stream is definitely backed by a file,
+        //and that file's size is greater than the mark limit,
+        //just digest the underlying file.
+        if (sz > markLimit) {
+            digestFile(((TikaInputStream)tis).getFile(), m);
+            return;
+        }
+
+        //try the usual mark/reset stuff.
+        //however, if you actually hit the bound,
+        //then stop and spool to file via TikaInputStream
+        SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, 
tis);
+        boolean finishedStream = false;
+        for (DigestAlgorithm algorithm : algorithms) {
+            bis.mark(markLimit + 1);
+            finishedStream = digestEach(algorithm, bis, m);
+            bis.reset();
+            if (!finishedStream) {
+                break;
+            }
+        }
+        if (!finishedStream) {
+            digestFile(((TikaInputStream)tis).getFile(), m);
+        }
+    }
+
+    private void digestFile(File f, Metadata m) throws IOException {
+        for (DigestAlgorithm algorithm : algorithms) {
+            InputStream is = new FileInputStream(f);
+            try {
+                digestEach(algorithm, is, m);
+            } finally {
+                IOUtils.closeQuietly(is);
+            }
+        }
+    }
+
+    /**
+     *
+     * @param algorithm algo to use
+     * @param is input stream to read from
+     * @param metadata metadata for reporting the digest
+     * @return whether or not this finished the input stream
+     * @throws IOException
+     */
+    private boolean digestEach(DigestAlgorithm algorithm,
+                            InputStream is, Metadata metadata) throws 
IOException {
+        String digest = null;
+        try {
+            switch (algorithm) {
+                case MD2:
+                    digest = DigestUtils.md2Hex(is);
+                    break;
+                case MD5:
+                    digest = DigestUtils.md5Hex(is);
+                    break;
+                case SHA1:
+                    digest = DigestUtils.sha1Hex(is);
+                    break;
+                case SHA256:
+                    digest = DigestUtils.sha256Hex(is);
+                    break;
+                case SHA384:
+                    digest = DigestUtils.sha384Hex(is);
+                    break;
+                case SHA512:
+                    digest = DigestUtils.sha512Hex(is);
+                    break;
+                default:
+                    throw new IllegalArgumentException("Sorry, not aware of 
algorithm: " + algorithm.toString());
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+            //swallow, or should we throw this?
+        }
+        if (is instanceof SimpleBoundedInputStream) {
+            if (((SimpleBoundedInputStream)is).hasHitBound()) {
+                return false;
+            }
+        }
+        metadata.set(algorithm.getMetadataKey(), digest);
+        return true;
+    }
+
+    /**
+     *
+     * @param s comma-delimited (no space) list of algorithms to use: 
md5,sha256
+     * @return the parsed algorithms, in the order they appear in s
+     */
+    public static DigestAlgorithm[] parse(String s) {
+        assert(s != null);
+
+        List<DigestAlgorithm> ret = new ArrayList<DigestAlgorithm>();
+        for (String algoString : s.split(",")) {
+            String uc = algoString.toUpperCase(Locale.ROOT);
+            if (uc.equals(DigestAlgorithm.MD2.toString())) {
+                ret.add(DigestAlgorithm.MD2);
+            } else if (uc.equals(DigestAlgorithm.MD5.toString())) {
+                ret.add(DigestAlgorithm.MD5);
+            } else if (uc.equals(DigestAlgorithm.SHA1.toString())) {
+                ret.add(DigestAlgorithm.SHA1);
+            } else if (uc.equals(DigestAlgorithm.SHA256.toString())) {
+                ret.add(DigestAlgorithm.SHA256);
+            } else if (uc.equals(DigestAlgorithm.SHA384.toString())) {
+                ret.add(DigestAlgorithm.SHA384);
+            } else if (uc.equals(DigestAlgorithm.SHA512.toString())) {
+                ret.add(DigestAlgorithm.SHA512);
+            } else {
+                StringBuilder sb = new StringBuilder();
+                int i = 0;
+                for (DigestAlgorithm algo : DigestAlgorithm.values()) {
+                    if (i++ > 0) {
+                        sb.append(", ");
+                    }
+                    sb.append(algo.toString());
+                }
+                throw new IllegalArgumentException("Couldn't match " + s + " 
with any of: " + sb.toString());
+            }
+        }
+        return ret.toArray(new DigestAlgorithm[ret.size()]);
+    }
+
+    /**
+     * Very slight modification of Commons' BoundedInputStream
+     * so that we can figure out if this hit the bound or not.
+     */
+    private class SimpleBoundedInputStream extends InputStream {
+        private final static int EOF = -1;
+        private final long max;
+        private final InputStream in;
+        private long pos;
+        boolean hitBound = false;
+
+        private SimpleBoundedInputStream(long max, InputStream in) {
+            this.max = max;
+            this.in = in;
+        }
+
+        @Override
+        public int read() throws IOException {
+            if (max >= 0 && pos >= max) {
+                hitBound = true;
+                return EOF;
+            }
+            final int result = in.read();
+            pos++;
+            return result;
+        }
+
+        /**
+         * Reads into the whole buffer via <code>read(byte[], int, int)</code>.
+         * @param b the buffer to read the bytes into
+         * @return the number of bytes read or -1 if the end of stream or
+         * the limit has been reached.
+         * @throws IOException if an I/O error occurs
+         */
+        @Override
+        public int read(final byte[] b) throws IOException {
+            return this.read(b, 0, b.length);
+        }
+
+        /**
+         * Invokes the delegate's <code>read(byte[], int, int)</code> method.
+         * @param b the buffer to read the bytes into
+         * @param off The start offset
+         * @param len The number of bytes to read
+         * @return the number of bytes read or -1 if the end of stream or
+         * the limit has been reached.
+         * @throws IOException if an I/O error occurs
+         */
+        @Override
+        public int read(final byte[] b, final int off, final int len) throws 
IOException {
+            if (max>=0 && pos>=max) {
+                return EOF;
+            }
+            final long maxRead = max>=0 ? Math.min(len, max-pos) : len;
+            final int bytesRead = in.read(b, off, (int)maxRead);
+
+            if (bytesRead==EOF) {
+                return EOF;
+            }
+
+            pos+=bytesRead;
+            return bytesRead;
+        }
+
+        /**
+         * Invokes the delegate's <code>skip(long)</code> method.
+         * @param n the number of bytes to skip
+         * @return the actual number of bytes skipped
+         * @throws IOException if an I/O error occurs
+         */
+        @Override
+        public long skip(final long n) throws IOException {
+            final long toSkip = max>=0 ? Math.min(n, max-pos) : n;
+            final long skippedBytes = in.skip(toSkip);
+            pos+=skippedBytes;
+            return skippedBytes;
+        }
+
+        @Override
+        public void reset() throws IOException {
+            in.reset();
+        }
+
+        @Override
+        public void mark(int readLimit) {
+            in.mark(readLimit);
+        }
+
+        public boolean hasHitBound() {
+            return hitBound;
+        }
+    }
+}

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java 
(original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Sun Jun 
28 01:57:30 2015
@@ -106,6 +106,10 @@ public abstract class TikaTest {
         }
     }
 
+    protected XMLResult getXML(String filePath, Parser parser, Metadata 
metadata) throws Exception {
+        return getXML(getResourceAsStream("/test-documents/" + filePath), 
parser, metadata);
+    }
+
     /**
      * Opens <code>filePath</code> under the <code>/test-documents/</code> resource
      * folder and hands it, together with a new <code>AutoDetectParser</code>, to
      * the stream-based <code>getXML</code> overload.
      *
      * @param filePath path of the test file, relative to <code>/test-documents/</code>
      * @param metadata metadata object to pass through to the stream-based overload
      * @return whatever the stream-based overload returns
      * @throws Exception if the stream-based overload throws
      */
     protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
         final String resourcePath = "/test-documents/" + filePath;
         return getXML(getResourceAsStream(resourcePath), new AutoDetectParser(), metadata);
     }

Added: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java?rev=1687981&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
 (added)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
 Sun Jun 28 01:57:30 2015
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static junit.framework.TestCase.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.utils.CommonsDigester;
+import org.junit.Test;
+
+
+/**
+ * Tests for {@link DigestingParser} using {@link CommonsDigester}: checks the
+ * digest values written to the metadata for each algorithm, for a
+ * comma-separated algorithm list, and for a mark limit that is shorter than
+ * the input file.
+ */
+public class DigestingParserTest extends TikaTest {
+
+    //metadata key prefix for digests; the algorithm name is appended to it
+    private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
+            "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+    private final int UNLIMITED = 1000000;//well, not really, but longer than input file
+    private final Parser p = new AutoDetectParser();
+
+    /**
+     * Digests the test file once per algorithm and once with a
+     * comma-separated list, comparing against known values.
+     */
+    @Test
+    public void testBasic() throws Exception {
+        Map<CommonsDigester.DigestAlgorithm, String> expected =
+                new HashMap<CommonsDigester.DigestAlgorithm, String>();
+
+        expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
+        expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
+        expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6");
+        expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f" +
+                                                            "82bc53764a0f1430d134ae3b70c32654");
+        expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+
+                                                            "8b8a6923fdf251ddab72c6e4b5d54160" +
+                                                            "9db917ba4260d1767995a844d8d654df");
+        expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+
+                                                            "da4c21f36b54d7acd06fcf68e974663b"+
+                                                            "fed1d256875be58d22beacf178154cc3"+
+                                                            "a1178cb73443deaa53aa0840324708bb");
+
+        //test each one
+        for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) {
+            Metadata m = new Metadata();
+            getXML("test_recursive_embedded.docx",
+                    new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), m);
+            assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
+        }
+
+
+        //test comma separated
+        CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512");
+        Metadata m = new Metadata();
+        getXML("test_recursive_embedded.docx",
+                new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m);
+        for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[]{
+                CommonsDigester.DigestAlgorithm.MD5,
+                CommonsDigester.DigestAlgorithm.SHA256,
+                CommonsDigester.DigestAlgorithm.SHA384,
+                CommonsDigester.DigestAlgorithm.SHA512}) {
+            assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
+        }
+
+        //algorithms that were not requested must not show up in the metadata
+        assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString()));
+        assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString()));
+
+    }
+
+    /**
+     * Digests the first <code>limit</code> bytes of the test file on their own,
+     * then digests the whole file with the same limit, and expects the two
+     * MD5 values to match.
+     */
+    @Test
+    public void testLimitedRead() throws Exception {
+        CommonsDigester.DigestAlgorithm algo = CommonsDigester.DigestAlgorithm.MD5;
+        int limit = 100;
+        byte[] bytes = new byte[limit];
+        InputStream is = getResourceAsStream("/test-documents/test_recursive_embedded.docx");
+        int filled = 0;
+        try {
+            //a single read() may return fewer bytes than requested, so loop until full
+            while (filled < limit) {
+                int n = is.read(bytes, filled, limit - filled);
+                if (n < 0) {
+                    break;
+                }
+                filled += n;
+            }
+        } finally {
+            is.close();
+        }
+        assertEquals("test file is shorter than the limit", limit, filled);
+
+        Metadata m = new Metadata();
+        try {
+            getXML(TikaInputStream.get(bytes),
+                    new DigestingParser(p, new CommonsDigester(limit, algo)), m);
+        } catch (TikaException e) {
+            //thrown because this is just a file fragment
+            assertContains("Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
+                    e.getMessage());
+        }
+        String expectedMD5 = m.get(P+"MD5");
+
+        m = new Metadata();
+        getXML("test_recursive_embedded.docx",
+                new DigestingParser(p, new CommonsDigester(limit, algo)), m);
+        assertEquals(expectedMD5, m.get(P+"MD5"));
+    }
+
+    /**
+     * Parses the test file with a mark limit of 100 and compares the MD5 in
+     * the metadata against a known value.
+     */
+    @Test
+    public void testReset() throws Exception {
+        String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0";
+        Metadata m = new Metadata();
+        getXML("test_recursive_embedded.docx",
+                new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m);
+        assertEquals(expectedMD5, m.get(P+"MD5"));
+    }
+
+    /**
+     * A negative mark limit is expected to be rejected with an
+     * {@link IllegalArgumentException}.
+     */
+    @Test
+    public void testNegativeMaxMarkLength() throws Exception {
+        Metadata m = new Metadata();
+        boolean ex = false;
+        try {
+            getXML("test_recursive_embedded.docx",
+                    new DigestingParser(p, new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)), m);
+        } catch (IllegalArgumentException e) {
+            ex = true;
+        }
+        assertTrue("Exception not thrown", ex);
+    }
+
+}

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
 Sun Jun 28 01:57:30 2015
@@ -30,8 +30,10 @@ import java.util.Set;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.utils.CommonsDigester;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.ContentHandlerFactory;
 import org.junit.Test;
@@ -210,7 +212,7 @@ public class RecursiveParserWrapperTest
         metadata.set(Metadata.RESOURCE_NAME_KEY, 
"test_recursive_embedded_npe.docx");
         list = getMetadata(metadata,
                 new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
-                false);
+                false, null);
 
         //Composite parser swallows caught TikaExceptions, IOExceptions and 
SAXExceptions
         //and just doesn't bother to report that there was an exception.
@@ -260,10 +262,30 @@ public class RecursiveParserWrapperTest
         assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
     }
 
+    /**
+     * Wraps the parser in a DigestingParser with an MD5 CommonsDigester and
+     * checks the digest stored in the metadata entries at indexes 0, 6 and 7
+     * of the list returned for the test file.
+     */
+    @Test
+    public void testDigesters() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
+        List<Metadata> list = getMetadata(metadata,
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
+                true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
+        String md5Key = "X-TIKA:digest:MD5";
+        assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
+        assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
+        assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
+    }
+
     private List<Metadata> getMetadata(Metadata metadata, 
ContentHandlerFactory contentHandlerFactory,
-                                       boolean catchEmbeddedExceptions) throws 
Exception {
+                                       boolean catchEmbeddedExceptions,
+                                       DigestingParser.Digester digester) 
throws Exception {
         ParseContext context = new ParseContext();
         Parser wrapped = new AutoDetectParser();
+        if (digester != null) {
+            wrapped = new DigestingParser(wrapped, digester);
+        }
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
                 contentHandlerFactory, catchEmbeddedExceptions);
         String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
@@ -274,8 +296,7 @@ public class RecursiveParserWrapperTest
         }
         InputStream stream = null;
         try {
-            stream = RecursiveParserWrapperTest.class.getResourceAsStream(
-                    path);
+            stream = 
TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(path).toURI());
             wrapper.parse(stream, new DefaultHandler(), metadata, context);
         } finally {
             IOUtils.closeQuietly(stream);
@@ -286,6 +307,6 @@ public class RecursiveParserWrapperTest
 
     private List<Metadata> getMetadata(Metadata metadata, 
ContentHandlerFactory contentHandlerFactory)
             throws Exception {
-        return getMetadata(metadata, contentHandlerFactory, true);
+        return getMetadata(metadata, contentHandlerFactory, true, null);
     }
 }

Modified: 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java 
(original)
+++ 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java 
Sun Jun 28 01:57:30 2015
@@ -38,11 +38,12 @@ import org.apache.cxf.jaxrs.lifecycle.Si
 import org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter;
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.utils.CommonsDigester;
 import org.apache.tika.server.resource.DetectorResource;
+import org.apache.tika.server.resource.LanguageResource;
 import org.apache.tika.server.resource.MetadataResource;
 import org.apache.tika.server.resource.RecursiveMetadataResource;
-import org.apache.tika.server.writer.TarWriter;
-import org.apache.tika.server.resource.LanguageResource;
 import org.apache.tika.server.resource.TikaDetectors;
 import org.apache.tika.server.resource.TikaMimeTypes;
 import org.apache.tika.server.resource.TikaParsers;
@@ -54,12 +55,14 @@ import org.apache.tika.server.resource.U
 import org.apache.tika.server.writer.CSVMessageBodyWriter;
 import org.apache.tika.server.writer.JSONMessageBodyWriter;
 import org.apache.tika.server.writer.MetadataListMessageBodyWriter;
+import org.apache.tika.server.writer.TarWriter;
 import org.apache.tika.server.writer.TextMessageBodyWriter;
 import org.apache.tika.server.writer.XMPMessageBodyWriter;
 import org.apache.tika.server.writer.ZipWriter;
 
 public class TikaServerCli {
     public static final int DEFAULT_PORT = 9998;
+    private static final int DEFAULT_DIGEST_MARK_LIMIT = 20*1024*1024;
     public static final String DEFAULT_HOST = "localhost";
     public static final Set<String> LOG_LEVELS =
             new HashSet<String>(Arrays.asList("debug", "info"));
@@ -71,6 +74,8 @@ public class TikaServerCli {
         options.addOption("h", "host", true, "host name (default = " + 
DEFAULT_HOST + ')');
         options.addOption("p", "port", true, "listen port (default = " + 
DEFAULT_PORT + ')');
         options.addOption("c", "config", true, "Tika Configuration file to 
override default config with.");
+        options.addOption("d", "digest", true, "include digest in metadata, 
e.g. md5,sha256");
+        options.addOption("dml", "digestMarkLimit", true, "max number of bytes 
to mark on stream for digest");
         options.addOption("l", "log", true, "request URI log level ('debug' or 
'info')");
         options.addOption("s", "includeStack", false, "whether or not to 
return a stack trace\nif there is an exception during 'parse'");
         options.addOption("?", "help", false, "this help message");
@@ -143,22 +148,39 @@ public class TikaServerCli {
               tika = TikaConfig.getDefaultConfig();
             }
 
+            DigestingParser.Digester digester = null;
+            if (line.hasOption("digest")){
+                int digestMarkLimit = DEFAULT_DIGEST_MARK_LIMIT;
+                if (line.hasOption("dml")) {
+                    String dmlS = line.getOptionValue("dml");
+                    try {
+                        digestMarkLimit = Integer.parseInt(dmlS);
+                    } catch (NumberFormatException e) {
+                        throw new RuntimeException("Must have parseable int 
after digestMarkLimit(dml): "+dmlS);
+                    }
+                }
+                digester = new CommonsDigester(digestMarkLimit,
+                        CommonsDigester.parse(line.getOptionValue("digest")));
+            }
+
+
+            TikaResource.init(tika, digester);
             JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
 
             List<ResourceProvider> rCoreProviders = new 
ArrayList<ResourceProvider>();
-            rCoreProviders.add(new SingletonResourceProvider(new 
MetadataResource(tika)));
-            rCoreProviders.add(new SingletonResourceProvider(new 
RecursiveMetadataResource(tika)));
-            rCoreProviders.add(new SingletonResourceProvider(new 
DetectorResource(tika)));
-            rCoreProviders.add(new SingletonResourceProvider(new 
LanguageResource(tika)));
-            rCoreProviders.add(new SingletonResourceProvider(new 
TranslateResource(tika)));
-            rCoreProviders.add(new SingletonResourceProvider(new 
TikaResource(tika)));
-            rCoreProviders.add(new SingletonResourceProvider(new 
UnpackerResource(tika)));
-            rCoreProviders.add(new SingletonResourceProvider(new 
TikaMimeTypes(tika)));
-            rCoreProviders.add(new SingletonResourceProvider(new 
TikaDetectors(tika)));
-            rCoreProviders.add(new SingletonResourceProvider(new 
TikaParsers(tika)));
-            rCoreProviders.add(new SingletonResourceProvider(new 
TikaVersion(tika)));
+            rCoreProviders.add(new SingletonResourceProvider(new 
MetadataResource()));
+            rCoreProviders.add(new SingletonResourceProvider(new 
RecursiveMetadataResource()));
+            rCoreProviders.add(new SingletonResourceProvider(new 
DetectorResource()));
+            rCoreProviders.add(new SingletonResourceProvider(new 
LanguageResource()));
+            rCoreProviders.add(new SingletonResourceProvider(new 
TranslateResource()));
+            rCoreProviders.add(new SingletonResourceProvider(new 
TikaResource()));
+            rCoreProviders.add(new SingletonResourceProvider(new 
UnpackerResource()));
+            rCoreProviders.add(new SingletonResourceProvider(new 
TikaMimeTypes()));
+            rCoreProviders.add(new SingletonResourceProvider(new 
TikaDetectors()));
+            rCoreProviders.add(new SingletonResourceProvider(new 
TikaParsers()));
+            rCoreProviders.add(new SingletonResourceProvider(new 
TikaVersion()));
             List<ResourceProvider> rAllProviders = new 
ArrayList<ResourceProvider>(rCoreProviders);
-            rAllProviders.add(new SingletonResourceProvider(new 
TikaWelcome(tika, rCoreProviders)));
+            rAllProviders.add(new SingletonResourceProvider(new 
TikaWelcome(rCoreProviders)));
             sf.setResourceProviders(rAllProviders);
 
             List<Object> providers = new ArrayList<Object>();

Modified: 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
 (original)
+++ 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
 Sun Jun 28 01:57:30 2015
@@ -24,13 +24,11 @@ import javax.ws.rs.Produces;
 import javax.ws.rs.core.Context;
 import javax.ws.rs.core.HttpHeaders;
 import javax.ws.rs.core.UriInfo;
-
 import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.tika.config.TikaConfig;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -41,12 +39,6 @@ public class DetectorResource {
     private static final Log logger = LogFactory.getLog(DetectorResource.class
             .getName());
 
-    private TikaConfig config = null;
-
-    public DetectorResource(TikaConfig config) {
-        this.config = config;
-    }
-
     @PUT
     @Path("stream")
     @Consumes("*/*")
@@ -60,7 +52,7 @@ public class DetectorResource {
         logger.info("Detecting media type for Filename: " + filename);
         met.add(Metadata.RESOURCE_NAME_KEY, filename);
         try {
-            return this.config.getDetector().detect(tis, met).toString();
+            return TikaResource.getConfig().getDetector().detect(tis, 
met).toString();
         } catch (IOException e) {
             logger.warn("Unable to detect MIME type for file. Reason: "
                     + e.getMessage());

Modified: 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
 (original)
+++ 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
 Sun Jun 28 01:57:30 2015
@@ -17,36 +17,27 @@
 
 package org.apache.tika.server.resource;
 
-import java.io.IOException;
-import java.io.InputStream;
-
 import javax.ws.rs.Consumes;
 import javax.ws.rs.POST;
 import javax.ws.rs.PUT;
 import javax.ws.rs.Path;
 import javax.ws.rs.Produces;
+import java.io.IOException;
+import java.io.InputStream;
 
+import com.google.common.base.Charsets;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.tika.config.TikaConfig;
 import org.apache.tika.language.LanguageIdentifier;
 import org.apache.tika.language.LanguageProfile;
 
-import com.google.common.base.Charsets;
-
 @Path("/language")
 public class LanguageResource {
 
        private static final Log logger = 
LogFactory.getLog(LanguageResource.class
                        .getName());
 
-       private TikaConfig config;
-
-       public LanguageResource(TikaConfig config) {
-               this.config = config;
-       }
-
        @PUT
        @POST
        @Path("/stream")

Modified: 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
 (original)
+++ 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
 Sun Jun 28 01:57:30 2015
@@ -28,31 +28,22 @@ import javax.ws.rs.core.HttpHeaders;
 import javax.ws.rs.core.MultivaluedMap;
 import javax.ws.rs.core.Response;
 import javax.ws.rs.core.UriInfo;
-
 import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
-import org.apache.tika.config.TikaConfig;
 import org.apache.tika.language.ProfilingHandler;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
-import org.xml.sax.helpers.DefaultHandler;
+import org.apache.tika.parser.Parser;
 
 
 @Path("/meta")
 public class MetadataResource {
     private static final Log logger = 
LogFactory.getLog(MetadataResource.class);
 
-    private TikaConfig tikaConfig;
-
-    public MetadataResource(TikaConfig tikaConfig) {
-        this.tikaConfig = tikaConfig;
-    }
-
     @POST
     @Consumes("multipart/form-data")
     @Produces({"text/csv", "application/json", "application/rdf+xml"})
@@ -127,7 +118,7 @@ public class MetadataResource {
                                    MultivaluedMap<String, String> httpHeaders, 
UriInfo info) throws IOException {
         final Metadata metadata = new Metadata();
         final ParseContext context = new ParseContext();
-        AutoDetectParser parser = TikaResource.createParser(tikaConfig);
+        Parser parser = TikaResource.createParser();
         TikaResource.fillMetadata(parser, metadata, context, httpHeaders);
         //no need to pass parser for embedded document parsing
         TikaResource.fillParseContext(context, httpHeaders, null);


Reply via email to