Author: tallison
Date: Sun Jun 28 01:57:30 2015
New Revision: 1687981
URL: http://svn.apache.org/r1687981
Log:
TIKA-1663 add a DigestingParser
Added:
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
tika/trunk/tika-app/src/main/resources/tika-app-batch-config.xml
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/IParserFactoryBuilder.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/ParserFactoryBuilder.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/AutoDetectParserFactory.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParserFactory.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
tika/trunk/tika-batch/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaDetectors.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaMimeTypes.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaParsers.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaUtils.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaVersion.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TikaWelcome.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/TranslateResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/DetectorResourceTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/StackTraceOffTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaDetectorsTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaMimeTypesTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaVersionTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaWelcomeTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TranslateResourceTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Jun 28 01:57:30 2015
@@ -1,5 +1,9 @@
Release 1.10 - Current Development
+ * Added DigestingParser to calculate digest hashes
+ and record them in metadata. Integrated with
+ tika-app and tika-server (TIKA-1663).
+
* Fixed ZipContainerDetector to detect all IPA files
(TIKA-1659).
Added:
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java?rev=1687981&view=auto
==============================================================================
---
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
(added)
+++
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
Sun Jun 28 01:57:30 2015
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.batch;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.Parser;
+
+public class DigestingAutoDetectParserFactory extends ParserFactory {
+
+ private DigestingParser.Digester digester = null;
+
+
+ @Override
+ public Parser getParser(TikaConfig config) {
+ Parser p = new AutoDetectParser(config);
+ if (digester == null) {
+ return p;
+ }
+ DigestingParser d = new DigestingParser(p, digester);
+ return d;
+ }
+
+ public void setDigester(DigestingParser.Digester digester) {
+ this.digester = digester;
+ }
+}
Added:
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java?rev=1687981&view=auto
==============================================================================
---
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
(added)
+++
tika/trunk/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
Sun Jun 28 01:57:30 2015
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.batch.builders;
+
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.tika.batch.DigestingAutoDetectParserFactory;
+import org.apache.tika.batch.ParserFactory;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.utils.CommonsDigester;
+import org.apache.tika.util.ClassLoaderUtil;
+import org.apache.tika.util.XMLDOMUtil;
+import org.w3c.dom.Node;
+
+public class AppParserFactoryBuilder implements IParserFactoryBuilder {
+
+ @Override
+ public ParserFactory build(Node node, Map<String, String> runtimeAttrs) {
+ Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node,
runtimeAttrs);
+ String className = localAttrs.get("class");
+ ParserFactory pf = ClassLoaderUtil.buildClass(ParserFactory.class,
className);
+
+ if (localAttrs.containsKey("parseRecursively")) {
+ String bString =
localAttrs.get("parseRecursively").toLowerCase(Locale.ENGLISH);
+ if (bString.equals("true")) {
+ pf.setParseRecursively(true);
+ } else if (bString.equals("false")) {
+ pf.setParseRecursively(false);
+ } else {
+ throw new RuntimeException("parseRecursively must have value
of \"true\" or \"false\": "+
+ bString);
+ }
+ }
+ if (pf instanceof DigestingAutoDetectParserFactory) {
+ DigestingParser.Digester d = buildDigester(localAttrs);
+ ((DigestingAutoDetectParserFactory)pf).setDigester(d);
+ }
+ return pf;
+ }
+
+ private DigestingParser.Digester buildDigester(Map<String, String>
localAttrs) {
+ String digestString = localAttrs.get("digest");
+ CommonsDigester.DigestAlgorithm[] algos =
CommonsDigester.parse(digestString);
+
+ String readLimitString = localAttrs.get("digestMarkLimit");
+ if (readLimitString == null) {
+ throw new IllegalArgumentException("Must specify
\"digestMarkLimit\" for "+
+ "the DigestingAutoDetectParserFactory");
+ }
+ int readLimit = -1;
+
+ try {
+ readLimit = Integer.parseInt(readLimitString);
+ } catch (NumberFormatException e) {
+ throw new IllegalArgumentException("Parameter \"digestMarkLimit\"
must be a parseable int: "+
+ readLimitString);
+ }
+ return new CommonsDigester(readLimit, algos);
+ }
+}
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Sun Jun
28 01:57:30 2015
@@ -88,6 +88,7 @@ import org.apache.tika.mime.MimeTypeExce
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.NetworkParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -95,6 +96,7 @@ import org.apache.tika.parser.ParserDeco
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
@@ -108,6 +110,9 @@ import org.xml.sax.helpers.DefaultHandle
* Simple command line interface for Apache Tika.
*/
public class TikaCLI {
+
+ private final int MAX_MARK = 20*1024*1024;//20MB
+
private File extractDir = new File(".");
private static final Log logger = LogFactory.getLog(TikaCLI.class);
@@ -334,6 +339,8 @@ public class TikaCLI {
*/
private String password = System.getenv("TIKA_PASSWORD");
+ private DigestingParser.Digester digester = null;
+
private boolean pipeMode = true;
private boolean serverMode = false;
@@ -400,6 +407,11 @@ public class TikaCLI {
fork = true;
} else if (arg.startsWith("--config=")) {
configure(arg.substring("--config=".length()));
+ } else if (arg.startsWith("--digest=")) {
+ CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse(
+ arg.substring("--digest=".length()));
+ digester = new CommonsDigester(MAX_MARK,algos);
+ parser = new DigestingParser(parser, digester);
} else if (arg.startsWith("-e")) {
encoding = arg.substring("-e".length());
} else if (arg.startsWith("--encoding=")) {
@@ -545,6 +557,8 @@ public class TikaCLI {
out.println(" with -x, -h, -t or -m; default
is -x)");
out.println(" -l or --language Output only language");
out.println(" -d or --detect Detect document type");
+ out.println(" --digest=X Include digest X (md2, md5,
sha1,");
+ out.println(" sha256, sha384, sha512");
out.println(" -eX or --encoding=X Use output encoding X");
out.println(" -pX or --password=X Use document password X");
out.println(" -z or --extract Extract all attachements into
current directory");
@@ -662,6 +676,9 @@ public class TikaCLI {
this.configFilePath = configFilePath;
TikaConfig config = new TikaConfig(new File(configFilePath));
parser = new AutoDetectParser(config);
+ if (digester != null) {
+ parser = new DigestingParser(parser, digester);
+ }
detector = config.getDetector();
context.set(Parser.class, parser);
}
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Sun Jun
28 01:57:30 2015
@@ -47,6 +47,10 @@ import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowEvent;
+import java.awt.event.ActionEvent;
+import java.awt.event.ActionListener;
+import java.awt.event.KeyEvent;
+import java.awt.event.WindowEvent;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -71,10 +75,12 @@ import org.apache.tika.metadata.serializ
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
@@ -92,6 +98,9 @@ import org.xml.sax.helpers.AttributesImp
public class TikaGUI extends JFrame
implements ActionListener, HyperlinkListener {
+ //maximum length to allow for mark for reparse to get JSON
+ private static final int MAX_MARK = 20*1024*1024;//20MB
+
/**
* Serial version UID.
*/
@@ -115,13 +124,16 @@ public class TikaGUI extends JFrame
final TikaConfig finalConfig = config;
SwingUtilities.invokeLater(new Runnable() {
public void run() {
- new TikaGUI(new
AutoDetectParser(finalConfig)).setVisible(true);
+ new TikaGUI(new DigestingParser(
+ new AutoDetectParser(finalConfig),
+ new CommonsDigester(MAX_MARK,
+ CommonsDigester.DigestAlgorithm.MD5,
+ CommonsDigester.DigestAlgorithm.SHA256)
+ )).setVisible(true);
}
});
}
- //maximum length to allow for mark for reparse to get JSON
- private final int MAX_MARK = 20*1024*1024;//20MB
/**
* Parsing context.
*/
@@ -334,11 +346,22 @@ public class TikaGUI extends JFrame
getXmlContentHandler(xmlBuffer));
context.set(DocumentSelector.class, new ImageDocumentSelector());
+
+ input = TikaInputStream.get(new ProgressMonitorInputStream(
+ this, "Parsing stream", input));
+
if (input.markSupported()) {
- input.mark(MAX_MARK);
+ int mark = -1;
+ if (input instanceof TikaInputStream) {
+ if (((TikaInputStream)input).hasFile()) {
+ mark = (int)((TikaInputStream)input).getLength();
+ }
+ }
+ if (mark == -1) {
+ mark = MAX_MARK;
+ }
+ input.mark(mark);
}
- input = new ProgressMonitorInputStream(
- this, "Parsing stream", input);
parser.parse(input, handler, md, context);
String[] names = md.names();
Added: tika/trunk/tika-app/src/main/resources/tika-app-batch-config.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/resources/tika-app-batch-config.xml?rev=1687981&view=auto
==============================================================================
--- tika/trunk/tika-app/src/main/resources/tika-app-batch-config.xml (added)
+++ tika/trunk/tika-app/src/main/resources/tika-app-batch-config.xml Sun Jun 28
01:57:30 2015
@@ -0,0 +1,136 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<!-- NOTE: tika-batch is still an experimental feature.
+ The configuration file will likely change and be backward incompatible
+ with new versions of Tika. Please stay tuned.
+ -->
+
+<tika-batch-config
+ maxAliveTimeSeconds="-1"
+ pauseOnEarlyTerminationMillis="10000"
+ timeoutThresholdMillis="300000"
+ timeoutCheckPulseMillis="1000"
+ maxQueueSize="10000"
+ numConsumers="default"> <!-- numConsumers = number of file consumers,
"default" = number of processors -1 -->
+
+ <!-- options to allow on the commandline -->
+ <commandline>
+ <option opt="c" longOpt="tika-config" hasArg="true"
+ description="TikaConfig file"/>
+ <option opt="bc" longOpt="batch-config" hasArg="true"
+ description="xml batch config file"/>
+ <!-- We needed sorted for testing. We added random for performance.
+ Where crawling a directory is slow, it might be beneficial to
+ go randomly so that the parsers are triggered earlier. The
+ default is operating system's choice ("os") which means whatever
order
+ the os returns files in .listFiles(). -->
+ <option opt="crawlOrder" hasArg="true"
+ description="how does the crawler sort the directories and
files:
+ (random|sorted|os)"/>
+ <option opt="numConsumers" hasArg="true"
+ description="number of fileConsumers threads"/>
+ <option opt="maxFileSizeBytes" hasArg="true"
+ description="maximum file size to process; do not process
files larger than this"/>
+ <option opt="maxQueueSize" hasArg="true"
+ description="maximum queue size for FileResources"/>
+ <option opt="fileList" hasArg="true"
+ description="file that contains a list of files (relative to
inputDir) to process"/>
+ <option opt="fileListEncoding" hasArg="true"
+ description="encoding for fileList"/>
+ <option opt="inputDir" hasArg="true"
+ description="root directory for the files to be processed"/>
+ <option opt="startDir" hasArg="true"
+ description="directory (under inputDir) at which to start
crawling"/>
+ <option opt="outputDir" hasArg="true"
+ description="output directory for output"/> <!-- do we want to
make this mandatory -->
+ <option opt="recursiveParserWrapper"
+ description="use the RecursiveParserWrapper or not (default =
false)"/>
+ <option opt="handleExisting" hasArg="true"
+ description="if an output file already exists, do you want to:
overwrite, rename or skip"/>
+ <option opt="basicHandlerType" hasArg="true"
+ description="what type of content handler: xml, text, html,
body"/>
+ <option opt="outputSuffix" hasArg="true"
+ description="suffix to add to the end of the output file
name"/>
+ <option opt="timeoutThresholdMillis" hasArg="true"
+ description="how long to wait before determining that a
consumer is stale"/>
+ <option opt="includeFilePat" hasArg="true"
+ description="regex that specifies which files to process"/>
+ <option opt="excludeFilePat" hasArg="true"
+ description="regex that specifies which files to avoid
processing"/>
+ <option opt="reporterSleepMillis" hasArg="true"
+ description="milliseconds between reports by the reporter"/>
+ <option opt="digest" hasArg="true"
+ description="which digest(s) to use, e.g. 'md5,sha512'"/>
+ <option opt="digestMarkLimit" hasArg="true"
+ description="max bytes to read for digest"/>
+ </commandline>
+
+
+ <!-- can specify inputDir="input", but the default config should not
include this -->
+ <!-- can also specify startDir="input/someDir" to specify which child
directory
+ to start processing -->
+ <crawler
builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+ crawlOrder="random"
+ maxFilesToAdd="-1"
+ maxFilesToConsider="-1"
+ includeFilePat=""
+ excludeFilePat=""
+ maxFileSizeBytes="-1"
+ />
+<!--
+ This is an example of a crawler that reads a list of files to be processed
from a
+ file. This assumes that the files in the list are relative to inputDir.
+ <crawler class="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+ fileList="files.txt"
+ fileListEncoding="UTF-8"
+ maxFilesToAdd="-1"
+ maxFilesToConsider="-1"
+ includeFilePat="(?i).pdf$"
+ excludeFilePat="(?i).msg$"
+ maxFileSizeBytes="-1"
+ inputDir="input"
+ />
+-->
+ <!--
+ To wrap parser in RecursiveParserWrapper (tika-app's -J or
tika-server's /rmeta),
+ add attribute recursiveParserWrapper="true" to consumers element.
+
+ To wrap parser with DigestingParser add attributes e.g.:
+ digest="md5,sha256" digestMarkLimit="10000000"
+ -->
+ <consumers
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+ recursiveParserWrapper="false"
consumersManagerMaxMillis="60000">
+ <parser
builderClass="org.apache.tika.batch.builders.AppParserFactoryBuilder"
+ class="org.apache.tika.batch.DigestingAutoDetectParserFactory"
+ parseRecursively="true"
+ digest="md5" digestMarkLimit="1000000"/>
+ <contenthandler
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+ basicHandlerType="xml" writeLimit="-1"/>
+ <!-- overwritePolicy: "skip" a file if its output file exists, "rename" the
output file, or "overwrite" it -->
+ <!-- can include e.g. outputDir="output", but we don't want to include
this in the default! -->
+ <outputstream class="FSOutputStreamFactory" encoding="UTF-8"
outputSuffix="xml"/>
+ </consumers>
+
+ <!-- reporter and interrupter are optional -->
+ <reporter
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder"
reporterSleepMillis="1000"
+ reporterStaleThresholdMillis="60000"/>
+ <interrupter
builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file
Modified:
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
(original)
+++
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
Sun Jun 28 01:57:30 2015
@@ -18,6 +18,7 @@
package org.apache.tika.cli;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayOutputStream;
@@ -126,6 +127,49 @@ public class TikaCLIBatchIntegrationTest
assertTrue(sysOutString.contains("MY_CUSTOM_LOG_CONFIG"));
}
+ @Test
+ public void testDigester() throws Exception {
+ Reader reader = null;
+/* try {
+ String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+ "-o", escape(tempDir.getAbsolutePath()),
+ "-numConsumers", "10",
+ "-J", //recursive Json
+ "-t" //plain text in content
+ };
+ TikaCLI.main(params);
+ reader = new InputStreamReader(
+ new FileInputStream(new File(tempDir,
"test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ assertEquals("59f626e09a8c16ab6dbc2800c685f772",
metadataList.get(0).get("X-TIKA:digest:MD5"));
+ assertEquals("22e6e91f408d018417cd452d6de3dede",
metadataList.get(5).get("X-TIKA:digest:MD5"));
+ } finally {
+ IOUtils.closeQuietly(reader);
+ }
+*/
+ reader = null;
+ try {
+ String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+ "-o", escape(tempDir.getAbsolutePath()),
+ "-numConsumers", "10",
+ "-J", //recursive Json
+ "-t", //plain text in content
+ "-digest", "sha512"
+ };
+ TikaCLI.main(params);
+ reader = new InputStreamReader(
+ new FileInputStream(new File(tempDir,
"test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ assertNotNull(metadataList.get(0).get("X-TIKA:digest:SHA512"));
+
assertTrue(metadataList.get(0).get("X-TIKA:digest:SHA512").startsWith("ee46d973ee1852c01858"));
+ } finally {
+ IOUtils.closeQuietly(reader);
+ }
+
+ }
+
public static String escape(String path) {
if (path.indexOf(' ') > -1) {
return '"' + path + '"';
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
(original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Sun
Jun 28 01:57:30 2015
@@ -100,6 +100,12 @@ public class TikaCLITest {
String[] params = {"-x", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("?xml
version=\"1.0\" encoding=\"UTF-8\"?"));
+
+ params = new String[]{"-x", "--digest=SHA256", resourcePrefix +
"alice.cli.test"};
+ TikaCLI.main(params);
+ assertTrue(outContent.toString(IOUtils.UTF_8.name())
+ .contains("<meta name=\"X-TIKA:digest:SHA256\"
content=\"e90779adbac09c4ee"));
+
}
/**
@@ -114,6 +120,11 @@ public class TikaCLITest {
assertTrue(outContent.toString("UTF-8").contains("html
xmlns=\"http://www.w3.org/1999/xhtml"));
assertTrue("Expanded <title></title> element should be present",
outContent.toString(IOUtils.UTF_8.name()).contains("<title></title>"));
+
+ params = new String[]{"-h", "--digest=SHA384", resourcePrefix +
"alice.cli.test"};
+ TikaCLI.main(params);
+ assertTrue(outContent.toString("UTF-8")
+ .contains("<meta name=\"X-TIKA:digest:SHA384\"
content=\"c69ea023f5da95a026"));
}
/**
@@ -137,6 +148,12 @@ public class TikaCLITest {
String[] params = {"-m", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
+
+ params = new String[]{"-m", "--digest=SHA512", resourcePrefix +
"alice.cli.test"};
+ TikaCLI.main(params);
+
assertTrue(outContent.toString(IOUtils.UTF_8.name()).contains("text/plain"));
+ assertTrue(outContent.toString(IOUtils.UTF_8.name())
+ .contains("X-TIKA:digest:SHA512:
dd459d99bc19ff78fd31fbae46e0"));
}
/**
@@ -146,7 +163,7 @@ public class TikaCLITest {
*/
@Test
public void testJsonMetadataOutput() throws Exception {
- String[] params = {"--json", resourcePrefix +
"testJsonMultipleInts.html"};
+ String[] params = {"--json", "--digest=MD2", resourcePrefix +
"testJsonMultipleInts.html"};
TikaCLI.main(params);
String json = outContent.toString(IOUtils.UTF_8.name());
//TIKA-1310
@@ -158,6 +175,7 @@ public class TikaCLITest {
int title = json.indexOf("\"title\"");
assertTrue(enc > -1 && fb > -1 && enc < fb);
assertTrue (fb > -1 && title > -1 && fb < title);
+
assertTrue(json.contains("\"X-TIKA:digest:MD2\":\"470481522c33aa7f6558dfc5cc0c8135\""));
}
/**
@@ -378,4 +396,14 @@ public class TikaCLITest {
assertTrue(content.contains("\\n\\nembed_4\\n"));
assertTrue(content.contains("\\n\\nembed_0"));
}
+
+ @Test
+ public void testDigestInJson() throws Exception {
+ String[] params = new String[]{"-J", "-r", "-t", "--digest=MD5",
resourcePrefix+"test_recursive_embedded.docx"};
+ TikaCLI.main(params);
+ String content = outContent.toString(IOUtils.UTF_8.name());
+ assertTrue(content.contains("\"X-TIKA:digest:MD5\":
\"59f626e09a8c16ab6dbc2800c685f772\","));
+ assertTrue(content.contains("\"X-TIKA:digest:MD5\":
\"f9627095ef86c482e61d99f0cc1cf87d\""));
+ }
+
}
Modified:
tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties
(original)
+++ tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties
Sun Jun 28 01:57:30 2015
@@ -14,7 +14,7 @@
# limitations under the License.
#info,debug, error,fatal ...
-log4j.rootLogger=info,stdout
+log4j.rootLogger=trace,stdout
#console
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/AutoDetectParserFactory.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/AutoDetectParserFactory.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/AutoDetectParserFactory.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/AutoDetectParserFactory.java
Sun Jun 28 01:57:30 2015
@@ -24,11 +24,12 @@ import org.apache.tika.parser.Parser;
/**
* Simple class for AutoDetectParser
*/
-public class AutoDetectParserFactory implements ParserFactory {
+public class AutoDetectParserFactory extends ParserFactory {
@Override
public Parser getParser(TikaConfig config) {
return new AutoDetectParser(config);
}
-
+
+
}
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParserFactory.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParserFactory.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParserFactory.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParserFactory.java
Sun Jun 28 01:57:30 2015
@@ -20,8 +20,18 @@ package org.apache.tika.batch;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.Parser;
-public interface ParserFactory {
-
- public Parser getParser(TikaConfig config);
+public abstract class ParserFactory {
+
+ private boolean parseRecursively = true;
+
+ public abstract Parser getParser(TikaConfig config);
+
+ public boolean getParseRecursively() {
+ return parseRecursively;
+ }
+
+ public void setParseRecursively(boolean parseRecursively) {
+ this.parseRecursively = parseRecursively;
+ }
}
Added:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/IParserFactoryBuilder.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/IParserFactoryBuilder.java?rev=1687981&view=auto
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/IParserFactoryBuilder.java
(added)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/IParserFactoryBuilder.java
Sun Jun 28 01:57:30 2015
@@ -0,0 +1,12 @@
+package org.apache.tika.batch.builders;
+
+
+import java.util.Map;
+
+import org.apache.tika.batch.ParserFactory;
+import org.w3c.dom.Node;
+
+public interface IParserFactoryBuilder {
+
+ public ParserFactory build(Node node, Map<String, String> runtimeAttrs);
+}
Added:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/ParserFactoryBuilder.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/ParserFactoryBuilder.java?rev=1687981&view=auto
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/ParserFactoryBuilder.java
(added)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/ParserFactoryBuilder.java
Sun Jun 28 01:57:30 2015
@@ -0,0 +1,33 @@
+package org.apache.tika.batch.builders;
+
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.tika.batch.ParserFactory;
+import org.apache.tika.util.ClassLoaderUtil;
+import org.apache.tika.util.XMLDOMUtil;
+import org.w3c.dom.Node;
+
+public class ParserFactoryBuilder implements IParserFactoryBuilder {
+
+
+ @Override
+ public ParserFactory build(Node node, Map<String, String> runtimeAttrs) {
+ Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node,
runtimeAttrs);
+ String className = localAttrs.get("class");
+ ParserFactory pf = ClassLoaderUtil.buildClass(ParserFactory.class,
className);
+
+ if (localAttrs.containsKey("parseRecursively")) {
+ String bString =
localAttrs.get("parseRecursively").toLowerCase(Locale.ENGLISH);
+ if (bString.equals("true")) {
+ pf.setParseRecursively(true);
+ } else if (bString.equals("false")) {
+ pf.setParseRecursively(false);
+ } else {
+ throw new RuntimeException("parseRecursively must have value
of \"true\" or \"false\": "+
+ bString);
+ }
+ }
+ return pf;
+ }
+}
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
Sun Jun 28 01:57:30 2015
@@ -19,6 +19,7 @@ package org.apache.tika.batch.fs;
import java.io.File;
import java.io.IOException;
+import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
@@ -74,10 +75,17 @@ public class FSBatchProcessCLI {
is = TikaInputStream.get(batchConfigFile);
} else {
if (logDefault) {
- logger.info("No config file set via -bc, relying on
default-tika-batch-config.xml");
+ logger.info("No config file set via -bc, relying on
tika-app-batch-config.xml or default-tika-batch-config.xml");
+ }
+ //test to see if there's a tika-app-batch-config.xml on the path
+ URL config =
FSBatchProcessCLI.class.getResource("/tika-app-batch-config.xml");
+ if (config != null) {
+ is = TikaInputStream.get(
+
FSBatchProcessCLI.class.getResourceAsStream("/tika-app-batch-config.xml"));
+ } else {
+ is = TikaInputStream.get(
+
FSBatchProcessCLI.class.getResourceAsStream("default-tika-batch-config.xml"));
}
- is = TikaInputStream.get(
-
FSBatchProcessCLI.class.getResourceAsStream("default-tika-batch-config.xml"));
}
return is;
}
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
Sun Jun 28 01:57:30 2015
@@ -31,6 +31,7 @@ import org.apache.tika.batch.ParserFacto
import org.apache.tika.batch.builders.AbstractConsumersBuilder;
import org.apache.tika.batch.builders.BatchProcessBuilder;
import org.apache.tika.batch.builders.IContentHandlerFactoryBuilder;
+import org.apache.tika.batch.builders.IParserFactoryBuilder;
import org.apache.tika.batch.fs.BasicTikaFSConsumer;
import org.apache.tika.batch.fs.FSConsumersManager;
import org.apache.tika.batch.fs.FSOutputStreamFactory;
@@ -156,10 +157,10 @@ public class BasicTikaFSConsumersBuilder
}
private ParserFactory getParserFactory(Node node, Map<String, String>
runtimeAttributes) {
- //TODO: add ability to set TikaConfig file path
Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node,
runtimeAttributes);
- String className = localAttrs.get("class");
- return ClassLoaderUtil.buildClass(ParserFactory.class, className);
+ String className = localAttrs.get("builderClass");
+ IParserFactoryBuilder builder =
ClassLoaderUtil.buildClass(IParserFactoryBuilder.class, className);
+ return builder.build(node, runtimeAttributes);
}
private OutputStreamFactory getOutputStreamFactory(Node node, Map<String,
String> runtimeAttributes) {
Modified:
tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
(original)
+++
tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
Sun Jun 28 01:57:30 2015
@@ -103,15 +103,20 @@
excludeFilePat="(?i).msg$"
maxFileSizeBytes="-1"
inputDir="input"
- />
--->
- <consumers
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
- recursiveParserWrapper="false"
consumersManagerMaxMillis="60000">
- <parser class="org.apache.tika.batch.AutoDetectParserFactory"
parseRecursively="true"/>
- <contenthandler
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+ />
+-->
+ <!--
+ To wrap parser in RecursiveParserWrapper (tika-app's -J or
tika-server's /rmeta),
+ add attribute recursiveParserWrapper="true" to consumers element.
+ -->
+ <consumers
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+ recursiveParserWrapper="false"
consumersManagerMaxMillis="60000">
+ <parser
builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+ class="org.apache.tika.batch.AutoDetectParserFactory"
+ parseRecursively="true"/>
+ <contenthandler
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
basicHandlerType="xml" writeLimit="-1"/>
- <!-- overwritePolicy: "skip" a file if output file exists, "rename" a
output file, "overwrite" -->
- <!-- can include e.g. outputDir="output", but we don't want to include
this in the default! -->
+ <!-- overwritePolicy: "skip" a file if output file exists, "rename" a
output file, "overwrite" --> <!-- can include e.g. outputDir="output",
but we don't want to include this in the default! -->
<outputstream class="FSOutputStreamFactory" encoding="UTF-8"
outputSuffix="xml"/>
</consumers>
Modified:
tika/trunk/tika-batch/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
(original)
+++
tika/trunk/tika-batch/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
Sun Jun 28 01:57:30 2015
@@ -21,9 +21,11 @@ import org.apache.tika.batch.ParserFacto
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.Parser;
-public class MockParserFactory implements ParserFactory {
+public class MockParserFactory extends ParserFactory {
+
@Override
public Parser getParser(TikaConfig config) {
return new MockParser();
}
+
}
Modified:
tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
(original)
+++
tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
Sun Jun 28 01:57:30 2015
@@ -96,11 +96,12 @@
<consumers
builderClass="org.apache.tika.batch.mock.MockConsumersBuilder"
recursiveParserWrapper="false" consumersManagerMaxMillis="1000">
- <parser class="org.apache.tika.parser.mock.MockParserFactory"
parseRecursively="true"/>
+ <parser
builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+ class="org.apache.tika.parser.mock.MockParserFactory"
+ parseRecursively="true"/>
<contenthandler
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
basicHandlerType="xml" writeLimit="-1"/>
-
<outputstream class="FSOutputStreamFactory"
encoding="UTF-8" outputSuffix="xml"/>
</consumers>
Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml
(original)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml Sun
Jun 28 01:57:30 2015
@@ -89,11 +89,13 @@
<consumers
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
recursiveParserWrapper="false">
- <parser class="org.apache.tika.parser.mock.MockParserFactory"
parseRecursively="true"/>
+ <parser
builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+ class="org.apache.tika.batch.AutoDetectParserFactory"
+ parseRecursively="true"/>
+
<contenthandler
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
basicHandlerType="xml" writeLimit="-1"/>
-
<outputstream class="FSOutputStreamFactory"
encoding="UTF-8" outputSuffix="xml"/>
</consumers>
Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml
(original)
+++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml Sun Jun
28 01:57:30 2015
@@ -95,11 +95,12 @@
<consumers
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
recursiveParserWrapper="false"
consumersManagerMaxMillis="120000">
- <parser class="org.apache.tika.parser.mock.MockParserFactory"
parseRecursively="true"/>
+ <parser
builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+ class="org.apache.tika.parser.mock.MockParserFactory"
+ parseRecursively="true"/>
<contenthandler
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
basicHandlerType="xml" writeLimit="-1"/>
-
<outputstream class="FSOutputStreamFactory"
encoding="UTF-8" outputSuffix="xml"/>
</consumers>
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java?rev=1687981&view=auto
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
(added)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
Sun Jun 28 01:57:30 2015
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class DigestingParser extends ParserDecorator {
+
+ /**
+ * Interface for optional digester, if specified during construction.
+ * See org.apache.tika.parser.utils.CommonsDigester in tika-parsers for an
+ * implementation.
+ */
+ public interface Digester {
+ /**
+ * Digests an InputStream and sets the appropriate value(s) in the
metadata.
+ * The Digester is also responsible for marking and resetting the
stream.
+ * <p>
+ * The given stream is guaranteed to support the
+ * {@link InputStream#markSupported() mark feature} and the digester
+ * is expected to {@link InputStream#mark(int) mark} the stream before
+ * reading any bytes from it, and to {@link InputStream#reset() reset}
+ * the stream before returning. The stream must not be closed by the
+ * digester.
+ *
+ * @param is InputStream to digest
+ * @param m Metadata to set the values for
+ * @param parseContext ParseContext
+ * @throws IOException
+ */
+ void digest(InputStream is, Metadata m, ParseContext parseContext)
throws IOException;
+
+
+ };
+
+ private final Digester digester;
+ /**
+ * Creates a decorator for the given parser that runs the digester (if non-null) on the stream before parsing.
+ *
+ * @param parser the parser instance to be decorated
+ */
+ public DigestingParser(Parser parser, Digester digester) {
+ super(parser);
+ this.digester = digester;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata, ParseContext context) throws IOException, SAXException, TikaException
{
+ if (digester != null) {
+ digester.digest(stream, metadata, context);
+ }
+ super.parse(stream, handler, metadata, context);
+ }
+}
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java?rev=1687981&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
Sun Jun 28 01:57:30 2015
@@ -0,0 +1,299 @@
+package org.apache.tika.parser.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
+ * that relies on commons.codec.digest.DigestUtils to calculate digest hashes.
+ * <p>
+ * This digester tries to use the regular mark/reset protocol on the
InputStream.
+ * However, this wraps an internal BoundedInputStream, and if the InputStream
+ * is not fully read, then this will reset the stream and
+ * spool the InputStream to disk (via TikaInputStream) and then digest the
file.
+ * <p>
+ * If a TikaInputStream is passed in and it has an underlying file that is
longer
+ * than the {@link #markLimit}, then this digester digests the file directly.
+ *
+ */
+public class CommonsDigester implements DigestingParser.Digester {
+
+ public enum DigestAlgorithm {
+ //those currently available in commons.digest
+ MD2,
+ MD5,
+ SHA1,
+ SHA256,
+ SHA384,
+ SHA512;
+
+ String getMetadataKey() {
+ return TikaCoreProperties.TIKA_META_PREFIX+
+
"digest"+Metadata.NAMESPACE_PREFIX_DELIMITER+this.toString();
+ }
+ }
+
+ private final List<DigestAlgorithm> algorithms = new
ArrayList<DigestAlgorithm>();
+ private final int markLimit;
+
+ public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
+ Collections.addAll(this.algorithms, algorithms);
+ if (markLimit < 0) {
+ throw new IllegalArgumentException("markLimit must be >= 0");
+ }
+ this.markLimit = markLimit;
+ }
+
+ @Override
+ public void digest(InputStream is, Metadata m, ParseContext parseContext)
throws IOException {
+ InputStream tis = TikaInputStream.get(is);
+ long sz = -1;
+ if (((TikaInputStream)tis).hasFile()) {
+ sz = ((TikaInputStream)tis).getLength();
+ }
+ //if the stream is definitely backed by a file,
+ //and that file's size is greater than the mark limit,
+ //just digest the underlying file.
+ if (sz > markLimit) {
+ digestFile(((TikaInputStream)tis).getFile(), m);
+ return;
+ }
+
+ //try the usual mark/reset stuff.
+ //however, if you actually hit the bound,
+ //then stop and spool to file via TikaInputStream
+ SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit,
tis);
+ boolean finishedStream = false;
+ for (DigestAlgorithm algorithm : algorithms) {
+ bis.mark(markLimit + 1);
+ finishedStream = digestEach(algorithm, bis, m);
+ bis.reset();
+ if (!finishedStream) {
+ break;
+ }
+ }
+ if (!finishedStream) {
+ digestFile(((TikaInputStream)tis).getFile(), m);
+ }
+ }
+
+ private void digestFile(File f, Metadata m) throws IOException {
+ for (DigestAlgorithm algorithm : algorithms) {
+ InputStream is = new FileInputStream(f);
+ try {
+ digestEach(algorithm, is, m);
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ }
+ }
+
+ /**
+ *
+ * @param algorithm algo to use
+ * @param is input stream to read from
+ * @param metadata metadata for reporting the digest
+ * @return whether or not this finished the input stream
+ * @throws IOException
+ */
+ private boolean digestEach(DigestAlgorithm algorithm,
+ InputStream is, Metadata metadata) throws
IOException {
+ String digest = null;
+ try {
+ switch (algorithm) {
+ case MD2:
+ digest = DigestUtils.md2Hex(is);
+ break;
+ case MD5:
+ digest = DigestUtils.md5Hex(is);
+ break;
+ case SHA1:
+ digest = DigestUtils.sha1Hex(is);
+ break;
+ case SHA256:
+ digest = DigestUtils.sha256Hex(is);
+ break;
+ case SHA384:
+ digest = DigestUtils.sha384Hex(is);
+ break;
+ case SHA512:
+ digest = DigestUtils.sha512Hex(is);
+ break;
+ default:
+ throw new IllegalArgumentException("Sorry, not aware of
algorithm: " + algorithm.toString());
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ //swallow, or should we throw this?
+ }
+ if (is instanceof SimpleBoundedInputStream) {
+ if (((SimpleBoundedInputStream)is).hasHitBound()) {
+ return false;
+ }
+ }
+ metadata.set(algorithm.getMetadataKey(), digest);
+ return true;
+ }
+
+ /**
+ *
+ * @param s comma-delimited (no space) list of algorithms to use:
md5,sha256
+ * @return the matching DigestAlgorithm values, in the order they appear in s
+ */
+ public static DigestAlgorithm[] parse(String s) {
+ assert(s != null);
+
+ List<DigestAlgorithm> ret = new ArrayList<DigestAlgorithm>();
+ for (String algoString : s.split(",")) {
+ String uc = algoString.toUpperCase(Locale.ROOT);
+ if (uc.equals(DigestAlgorithm.MD2.toString())) {
+ ret.add(DigestAlgorithm.MD2);
+ } else if (uc.equals(DigestAlgorithm.MD5.toString())) {
+ ret.add(DigestAlgorithm.MD5);
+ } else if (uc.equals(DigestAlgorithm.SHA1.toString())) {
+ ret.add(DigestAlgorithm.SHA1);
+ } else if (uc.equals(DigestAlgorithm.SHA256.toString())) {
+ ret.add(DigestAlgorithm.SHA256);
+ } else if (uc.equals(DigestAlgorithm.SHA384.toString())) {
+ ret.add(DigestAlgorithm.SHA384);
+ } else if (uc.equals(DigestAlgorithm.SHA512.toString())) {
+ ret.add(DigestAlgorithm.SHA512);
+ } else {
+ StringBuilder sb = new StringBuilder();
+ int i = 0;
+ for (DigestAlgorithm algo : DigestAlgorithm.values()) {
+ if (i++ > 0) {
+ sb.append(", ");
+ }
+ sb.append(algo.toString());
+ }
+ throw new IllegalArgumentException("Couldn't match " + s + "
with any of: " + sb.toString());
+ }
+ }
+ return ret.toArray(new DigestAlgorithm[ret.size()]);
+ }
+
+ /**
+ * Very slight modification of Commons' BoundedInputStream
+ * so that we can figure out if this hit the bound or not.
+ */
+ private class SimpleBoundedInputStream extends InputStream {
+ private final static int EOF = -1;
+ private final long max;
+ private final InputStream in;
+ private long pos;
+ boolean hitBound = false;
+
+ private SimpleBoundedInputStream(long max, InputStream in) {
+ this.max = max;
+ this.in = in;
+ }
+
+ @Override
+ public int read() throws IOException {
+ if (max >= 0 && pos >= max) {
+ hitBound = true;
+ return EOF;
+ }
+ final int result = in.read();
+ pos++;
+ return result;
+ }
+
+ /**
+ * Invokes the delegate's <code>read(byte[])</code> method.
+ * @param b the buffer to read the bytes into
+ * @return the number of bytes read or -1 if the end of stream or
+ * the limit has been reached.
+ * @throws IOException if an I/O error occurs
+ */
+ @Override
+ public int read(final byte[] b) throws IOException {
+ return this.read(b, 0, b.length);
+ }
+
+ /**
+ * Invokes the delegate's <code>read(byte[], int, int)</code> method.
+ * @param b the buffer to read the bytes into
+ * @param off The start offset
+ * @param len The number of bytes to read
+ * @return the number of bytes read or -1 if the end of stream or
+ * the limit has been reached.
+ * @throws IOException if an I/O error occurs
+ */
+ @Override
+ public int read(final byte[] b, final int off, final int len) throws
IOException {
+ if (max>=0 && pos>=max) {
+ return EOF;
+ }
+ final long maxRead = max>=0 ? Math.min(len, max-pos) : len;
+ final int bytesRead = in.read(b, off, (int)maxRead);
+
+ if (bytesRead==EOF) {
+ return EOF;
+ }
+
+ pos+=bytesRead;
+ return bytesRead;
+ }
+
+ /**
+ * Invokes the delegate's <code>skip(long)</code> method.
+ * @param n the number of bytes to skip
+ * @return the actual number of bytes skipped
+ * @throws IOException if an I/O error occurs
+ */
+ @Override
+ public long skip(final long n) throws IOException {
+ final long toSkip = max>=0 ? Math.min(n, max-pos) : n;
+ final long skippedBytes = in.skip(toSkip);
+ pos+=skippedBytes;
+ return skippedBytes;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ in.reset();
+ }
+
+ @Override
+ public void mark(int readLimit) {
+ in.mark(readLimit);
+ }
+
+ public boolean hasHitBound() {
+ return hitBound;
+ }
+ }
+}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
(original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Sun Jun
28 01:57:30 2015
@@ -106,6 +106,10 @@ public abstract class TikaTest {
}
}
+ protected XMLResult getXML(String filePath, Parser parser, Metadata
metadata) throws Exception {
+ return getXML(getResourceAsStream("/test-documents/" + filePath),
parser, metadata);
+ }
+
protected XMLResult getXML(String filePath, Metadata metadata) throws
Exception {
return getXML(getResourceAsStream("/test-documents/" + filePath), new
AutoDetectParser(), metadata);
}
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java?rev=1687981&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
(added)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
Sun Jun 28 01:57:30 2015
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static junit.framework.TestCase.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.utils.CommonsDigester;
+import org.junit.Test;
+
+
+public class DigestingParserTest extends TikaTest {
+
+ private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
+ "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+ private final int UNLIMITED = 1000000;//well, not really, but longer than
input file
+ private final Parser p = new AutoDetectParser();
+
+ @Test
+ public void testBasic() throws Exception {
+ Map<CommonsDigester.DigestAlgorithm, String> expected =
+ new HashMap<CommonsDigester.DigestAlgorithm, String>();
+
+
expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
+
expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
+
expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6");
+
expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f"
+
+
"82bc53764a0f1430d134ae3b70c32654");
+
expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+
+
"8b8a6923fdf251ddab72c6e4b5d54160" +
+
"9db917ba4260d1767995a844d8d654df");
+
expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+
+
"da4c21f36b54d7acd06fcf68e974663b"+
+
"fed1d256875be58d22beacf178154cc3"+
+
"a1178cb73443deaa53aa0840324708bb");
+
+ //test each one
+ for (CommonsDigester.DigestAlgorithm algo :
CommonsDigester.DigestAlgorithm.values()) {
+ Metadata m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(UNLIMITED,
algo)), m);
+ assertEquals(algo.toString(), expected.get(algo), m.get(P +
algo.toString()));
+ }
+
+
+ //test comma separated
+ CommonsDigester.DigestAlgorithm[] algos =
CommonsDigester.parse("md5,sha256,sha384,sha512");
+ Metadata m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)),
m);
+ for (CommonsDigester.DigestAlgorithm algo : new
CommonsDigester.DigestAlgorithm[]{
+ CommonsDigester.DigestAlgorithm.MD5,
+ CommonsDigester.DigestAlgorithm.SHA256,
+ CommonsDigester.DigestAlgorithm.SHA384,
+ CommonsDigester.DigestAlgorithm.SHA512}) {
+ assertEquals(algo.toString(), expected.get(algo), m.get(P +
algo.toString()));
+ }
+
+ assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString()));
+ assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString()));
+
+ }
+
+ @Test
+ public void testLimitedRead() throws Exception {
+ CommonsDigester.DigestAlgorithm algo =
CommonsDigester.DigestAlgorithm.MD5;
+ int limit = 100;
+ byte[] bytes = new byte[limit];
+ InputStream is =
getResourceAsStream("/test-documents/test_recursive_embedded.docx");
+ is.read(bytes, 0, limit);
+ is.close();
+ Metadata m = new Metadata();
+ try {
+ XMLResult xml = getXML(TikaInputStream.get(bytes),
+ new DigestingParser(p, new CommonsDigester(100, algo)), m);
+ } catch (TikaException e) {
+ //thrown because this is just a file fragment
+ assertContains("Unexpected RuntimeException from
org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
+ e.getMessage());
+ }
+ String expectedMD5 = m.get(P+"MD5");
+
+ m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(100, algo)), m);
+ assertEquals(expectedMD5, m.get(P+"MD5"));
+ }
+
+ @Test
+ public void testReset() throws Exception {
+ String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0";
+ Metadata m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(100,
CommonsDigester.DigestAlgorithm.MD5)), m);
+ assertEquals(expectedMD5, m.get(P+"MD5"));
+ }
+
+ @Test
+ public void testNegativeMaxMarkLength() throws Exception {
+ Metadata m = new Metadata();
+ boolean ex = false;
+ try {
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(-1,
CommonsDigester.DigestAlgorithm.MD5)), m);
+ } catch (IllegalArgumentException e) {
+ ex = true;
+ }
+ assertTrue("Exception not thrown", ex);
+ }
+
+}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
Sun Jun 28 01:57:30 2015
@@ -30,8 +30,10 @@ import java.util.Set;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
import org.junit.Test;
@@ -210,7 +212,7 @@ public class RecursiveParserWrapperTest
metadata.set(Metadata.RESOURCE_NAME_KEY,
"test_recursive_embedded_npe.docx");
list = getMetadata(metadata,
new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
- false);
+ false, null);
//Composite parser swallows caught TikaExceptions, IOExceptions and
SAXExceptions
//and just doesn't bother to report that there was an exception.
@@ -260,10 +262,30 @@ public class RecursiveParserWrapperTest
assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
}
+ @Test
+ public void testDigesters() throws Exception {
+     //run the recursive wrapper over the docx with an MD5 CommonsDigester
+     //(mark limit 100000) and check the digest recorded in entries 0, 6 and 7
+     //of the returned metadata list
+     Metadata metadata = new Metadata();
+     metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
+     List<Metadata> list = getMetadata(metadata,
+             new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
+             true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
+     String md5Key = "X-TIKA:digest:MD5";
+     assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
+     assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
+     assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
+ }
+
private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory contentHandlerFactory,
- boolean catchEmbeddedExceptions) throws
Exception {
+ boolean catchEmbeddedExceptions,
+ DigestingParser.Digester digester)
throws Exception {
ParseContext context = new ParseContext();
Parser wrapped = new AutoDetectParser();
+ if (digester != null) {
+ wrapped = new DigestingParser(wrapped, digester);
+ }
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
contentHandlerFactory, catchEmbeddedExceptions);
String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
@@ -274,8 +296,7 @@ public class RecursiveParserWrapperTest
}
InputStream stream = null;
try {
- stream = RecursiveParserWrapperTest.class.getResourceAsStream(
- path);
+ stream =
TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(path).toURI());
wrapper.parse(stream, new DefaultHandler(), metadata, context);
} finally {
IOUtils.closeQuietly(stream);
@@ -286,6 +307,6 @@ public class RecursiveParserWrapperTest
private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory contentHandlerFactory)
throws Exception {
- return getMetadata(metadata, contentHandlerFactory, true);
+ return getMetadata(metadata, contentHandlerFactory, true, null);
}
}
Modified:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
(original)
+++
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
Sun Jun 28 01:57:30 2015
@@ -38,11 +38,12 @@ import org.apache.cxf.jaxrs.lifecycle.Si
import org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.server.resource.DetectorResource;
+import org.apache.tika.server.resource.LanguageResource;
import org.apache.tika.server.resource.MetadataResource;
import org.apache.tika.server.resource.RecursiveMetadataResource;
-import org.apache.tika.server.writer.TarWriter;
-import org.apache.tika.server.resource.LanguageResource;
import org.apache.tika.server.resource.TikaDetectors;
import org.apache.tika.server.resource.TikaMimeTypes;
import org.apache.tika.server.resource.TikaParsers;
@@ -54,12 +55,14 @@ import org.apache.tika.server.resource.U
import org.apache.tika.server.writer.CSVMessageBodyWriter;
import org.apache.tika.server.writer.JSONMessageBodyWriter;
import org.apache.tika.server.writer.MetadataListMessageBodyWriter;
+import org.apache.tika.server.writer.TarWriter;
import org.apache.tika.server.writer.TextMessageBodyWriter;
import org.apache.tika.server.writer.XMPMessageBodyWriter;
import org.apache.tika.server.writer.ZipWriter;
public class TikaServerCli {
public static final int DEFAULT_PORT = 9998;
+ private static final int DEFAULT_DIGEST_MARK_LIMIT = 20*1024*1024;
public static final String DEFAULT_HOST = "localhost";
public static final Set<String> LOG_LEVELS =
new HashSet<String>(Arrays.asList("debug", "info"));
@@ -71,6 +74,8 @@ public class TikaServerCli {
options.addOption("h", "host", true, "host name (default = " +
DEFAULT_HOST + ')');
options.addOption("p", "port", true, "listen port (default = " +
DEFAULT_PORT + ')');
options.addOption("c", "config", true, "Tika Configuration file to
override default config with.");
+ options.addOption("d", "digest", true, "include digest in metadata,
e.g. md5,sha256");
+ options.addOption("dml", "digestMarkLimit", true, "max number of bytes
to mark on stream for digest");
options.addOption("l", "log", true, "request URI log level ('debug' or
'info')");
options.addOption("s", "includeStack", false, "whether or not to
return a stack trace\nif there is an exception during 'parse'");
options.addOption("?", "help", false, "this help message");
@@ -143,22 +148,39 @@ public class TikaServerCli {
tika = TikaConfig.getDefaultConfig();
}
+ DigestingParser.Digester digester = null;
+ if (line.hasOption("digest")){
+ int digestMarkLimit = DEFAULT_DIGEST_MARK_LIMIT;
+ if (line.hasOption("dml")) {
+ String dmlS = line.getOptionValue("dml");
+ try {
+ digestMarkLimit = Integer.parseInt(dmlS);
+ } catch (NumberFormatException e) {
+ throw new RuntimeException("Must have parseable int
after digestMarkLimit(dml): "+dmlS);
+ }
+ }
+ digester = new CommonsDigester(digestMarkLimit,
+ CommonsDigester.parse(line.getOptionValue("digest")));
+ }
+
+
+ TikaResource.init(tika, digester);
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
List<ResourceProvider> rCoreProviders = new
ArrayList<ResourceProvider>();
- rCoreProviders.add(new SingletonResourceProvider(new
MetadataResource(tika)));
- rCoreProviders.add(new SingletonResourceProvider(new
RecursiveMetadataResource(tika)));
- rCoreProviders.add(new SingletonResourceProvider(new
DetectorResource(tika)));
- rCoreProviders.add(new SingletonResourceProvider(new
LanguageResource(tika)));
- rCoreProviders.add(new SingletonResourceProvider(new
TranslateResource(tika)));
- rCoreProviders.add(new SingletonResourceProvider(new
TikaResource(tika)));
- rCoreProviders.add(new SingletonResourceProvider(new
UnpackerResource(tika)));
- rCoreProviders.add(new SingletonResourceProvider(new
TikaMimeTypes(tika)));
- rCoreProviders.add(new SingletonResourceProvider(new
TikaDetectors(tika)));
- rCoreProviders.add(new SingletonResourceProvider(new
TikaParsers(tika)));
- rCoreProviders.add(new SingletonResourceProvider(new
TikaVersion(tika)));
+ rCoreProviders.add(new SingletonResourceProvider(new
MetadataResource()));
+ rCoreProviders.add(new SingletonResourceProvider(new
RecursiveMetadataResource()));
+ rCoreProviders.add(new SingletonResourceProvider(new
DetectorResource()));
+ rCoreProviders.add(new SingletonResourceProvider(new
LanguageResource()));
+ rCoreProviders.add(new SingletonResourceProvider(new
TranslateResource()));
+ rCoreProviders.add(new SingletonResourceProvider(new
TikaResource()));
+ rCoreProviders.add(new SingletonResourceProvider(new
UnpackerResource()));
+ rCoreProviders.add(new SingletonResourceProvider(new
TikaMimeTypes()));
+ rCoreProviders.add(new SingletonResourceProvider(new
TikaDetectors()));
+ rCoreProviders.add(new SingletonResourceProvider(new
TikaParsers()));
+ rCoreProviders.add(new SingletonResourceProvider(new
TikaVersion()));
List<ResourceProvider> rAllProviders = new
ArrayList<ResourceProvider>(rCoreProviders);
- rAllProviders.add(new SingletonResourceProvider(new
TikaWelcome(tika, rCoreProviders)));
+ rAllProviders.add(new SingletonResourceProvider(new
TikaWelcome(rCoreProviders)));
sf.setResourceProviders(rAllProviders);
List<Object> providers = new ArrayList<Object>();
Modified:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
(original)
+++
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/DetectorResource.java
Sun Jun 28 01:57:30 2015
@@ -24,13 +24,11 @@ import javax.ws.rs.Produces;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.HttpHeaders;
import javax.ws.rs.core.UriInfo;
-
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -41,12 +39,6 @@ public class DetectorResource {
private static final Log logger = LogFactory.getLog(DetectorResource.class
.getName());
- private TikaConfig config = null;
-
- public DetectorResource(TikaConfig config) {
- this.config = config;
- }
-
@PUT
@Path("stream")
@Consumes("*/*")
@@ -60,7 +52,7 @@ public class DetectorResource {
logger.info("Detecting media type for Filename: " + filename);
met.add(Metadata.RESOURCE_NAME_KEY, filename);
try {
- return this.config.getDetector().detect(tis, met).toString();
+ return TikaResource.getConfig().getDetector().detect(tis,
met).toString();
} catch (IOException e) {
logger.warn("Unable to detect MIME type for file. Reason: "
+ e.getMessage());
Modified:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
(original)
+++
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
Sun Jun 28 01:57:30 2015
@@ -17,36 +17,27 @@
package org.apache.tika.server.resource;
-import java.io.IOException;
-import java.io.InputStream;
-
import javax.ws.rs.Consumes;
import javax.ws.rs.POST;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
+import java.io.IOException;
+import java.io.InputStream;
+import com.google.common.base.Charsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.language.LanguageProfile;
-import com.google.common.base.Charsets;
-
@Path("/language")
public class LanguageResource {
private static final Log logger =
LogFactory.getLog(LanguageResource.class
.getName());
- private TikaConfig config;
-
- public LanguageResource(TikaConfig config) {
- this.config = config;
- }
-
@PUT
@POST
@Path("/stream")
Modified:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java?rev=1687981&r1=1687980&r2=1687981&view=diff
==============================================================================
---
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
(original)
+++
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/MetadataResource.java
Sun Jun 28 01:57:30 2015
@@ -28,31 +28,22 @@ import javax.ws.rs.core.HttpHeaders;
import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.UriInfo;
-
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.language.ProfilingHandler;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
-import org.xml.sax.helpers.DefaultHandler;
+import org.apache.tika.parser.Parser;
@Path("/meta")
public class MetadataResource {
private static final Log logger =
LogFactory.getLog(MetadataResource.class);
- private TikaConfig tikaConfig;
-
- public MetadataResource(TikaConfig tikaConfig) {
- this.tikaConfig = tikaConfig;
- }
-
@POST
@Consumes("multipart/form-data")
@Produces({"text/csv", "application/json", "application/rdf+xml"})
@@ -127,7 +118,7 @@ public class MetadataResource {
MultivaluedMap<String, String> httpHeaders,
UriInfo info) throws IOException {
final Metadata metadata = new Metadata();
final ParseContext context = new ParseContext();
- AutoDetectParser parser = TikaResource.createParser(tikaConfig);
+ Parser parser = TikaResource.createParser();
TikaResource.fillMetadata(parser, metadata, context, httpHeaders);
//no need to pass parser for embedded document parsing
TikaResource.fillParseContext(context, httpHeaders, null);