Repository: tika Updated Branches: refs/heads/master c94236a83 -> 01109c8fe
TIKA-1918: make outputSuffix optional in tika-batch Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/34db9359 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/34db9359 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/34db9359 Branch: refs/heads/master Commit: 34db93595c71745e3bccdabc39e72181c03abbbd Parents: 9ebf066 Author: tballison <[email protected]> Authored: Thu Mar 31 11:52:27 2016 -0400 Committer: tballison <[email protected]> Committed: Thu Mar 31 11:52:27 2016 -0400 ---------------------------------------------------------------------- .../tika/cli/BatchCommandLineBuilder.java | 7 -- .../main/resources/tika-app-batch-config.xml | 10 +- .../tika/cli/TikaCLIBatchCommandLineTest.java | 1 - .../builders/BasicTikaFSConsumersBuilder.java | 51 ++++++++- .../tika/batch/fs/default-tika-batch-config.xml | 50 +++++---- .../apache/tika/batch/fs/BatchProcessTest.java | 19 +++- .../tika/batch/fs/HandlerBuilderTest.java | 4 - .../tika-batch-config-MockConsumersBuilder.xml | 2 +- .../test/resources/tika-batch-config-broken.xml | 2 +- .../tika-batch-config-test-suffix-override.xml | 112 +++++++++++++++++++ .../test/resources/tika-batch-config-test.xml | 2 +- .../tika/sax/BasicContentHandlerFactory.java | 8 ++ 12 files changed, 222 insertions(+), 46 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java ---------------------------------------------------------------------- diff --git a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java index da44956..2f85546 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java +++ b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java @@ -160,35 +160,28 @@ class 
BatchCommandLineBuilder { map.remove("-h"); map.remove("--html"); map.put("-basicHandlerType", "html"); - map.put("-outputSuffix", "html"); } else if (map.containsKey("-x") || map.containsKey("--xml")) { map.remove("-x"); map.remove("--xml"); map.put("-basicHandlerType", "xml"); - map.put("-outputSuffix", "xml"); } else if (map.containsKey("-t") || map.containsKey("--text")) { map.remove("-t"); map.remove("--text"); map.put("-basicHandlerType", "text"); - map.put("-outputSuffix", "txt"); } else if (map.containsKey("-m") || map.containsKey("--metadata")) { map.remove("-m"); map.remove("--metadata"); map.put("-basicHandlerType", "ignore"); - map.put("-outputSuffix", "json"); } else if (map.containsKey("-T") || map.containsKey("--text-main")) { map.remove("-T"); map.remove("--text-main"); map.put("-basicHandlerType", "body"); - map.put("-outputSuffix", "txt"); } if (map.containsKey("-J") || map.containsKey("--jsonRecursive")) { map.remove("-J"); map.remove("--jsonRecursive"); map.put("-recursiveParserWrapper", "true"); - //overwrite outputSuffix - map.put("-outputSuffix", "json"); } if (map.containsKey("--inputDir") || map.containsKey("-i")) { http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/main/resources/tika-app-batch-config.xml ---------------------------------------------------------------------- diff --git a/tika-app/src/main/resources/tika-app-batch-config.xml b/tika-app/src/main/resources/tika-app-batch-config.xml index e2f1204..99651a1 100644 --- a/tika-app/src/main/resources/tika-app-batch-config.xml +++ b/tika-app/src/main/resources/tika-app-batch-config.xml @@ -124,9 +124,13 @@ digest="md5" digestMarkLimit="1000000"/> <contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder" basicHandlerType="xml" writeLimit="-1"/> - <!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" --> - <!-- can include e.g. 
outputDir="output", but we don't want to include this in the default! --> - <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/> + <!-- can specify custom output file suffix with: + outputSuffix="mysuffix" + if no suffix is specified, BasicTikaFSConsumersBuilder does its best to guess --> + <!-- can specify compression with + compression="bzip2|gzip|zip" --> + + <outputstream class="FSOutputStreamFactory" encoding="UTF-8"/> </consumers> <!-- reporter and interrupter are optional --> http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java index 260273e..e543ccc 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java @@ -113,7 +113,6 @@ public class TikaCLIBatchCommandLineTest { Map<String, String> attrs = mapify(commandLine); assertEquals("true", attrs.get("-recursiveParserWrapper")); assertEquals("html", attrs.get("-basicHandlerType")); - assertEquals("json", attrs.get("-outputSuffix")); assertEquals("batch-config.xml", attrs.get("-bc")); assertEquals(testInputPathForCommandLine, attrs.get("-inputDir")); } http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java ---------------------------------------------------------------------- diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java index b65b046..4879af4 100644 --- 
a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java +++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java @@ -41,6 +41,7 @@ import org.apache.tika.batch.fs.FSOutputStreamFactory; import org.apache.tika.batch.fs.FSUtil; import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer; import org.apache.tika.config.TikaConfig; +import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.util.ClassLoaderUtil; import org.apache.tika.util.PropsUtil; @@ -125,7 +126,9 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder { } ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(contentHandlerFactoryNode, runtimeAttributes); ParserFactory parserFactory = getParserFactory(parserFactoryNode, runtimeAttributes); - OutputStreamFactory outputStreamFactory = getOutputStreamFactory(outputStreamFactoryNode, runtimeAttributes); + OutputStreamFactory outputStreamFactory = getOutputStreamFactory( + outputStreamFactoryNode, runtimeAttributes, + contentHandlerFactory, recursiveParserWrapper); if (recursiveParserWrapper) { for (int i = 0; i < numConsumers; i++) { @@ -147,7 +150,6 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder { return manager; } - private ContentHandlerFactory getContentHandlerFactory(Node node, Map<String, String> runtimeAttributes) { Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes); @@ -166,7 +168,10 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder { return builder.build(node, runtimeAttributes); } - private OutputStreamFactory getOutputStreamFactory(Node node, Map<String, String> runtimeAttributes) { + private OutputStreamFactory getOutputStreamFactory(Node node, + Map<String, String> runtimeAttributes, + ContentHandlerFactory contentHandlerFactory, + boolean useRecursiveParserWrapper) { 
Map<String, String> attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes); Path outputDir = PropsUtil.getPath(attrs.get("outputDir"), null); @@ -196,6 +201,17 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder { compression = FSOutputStreamFactory.COMPRESSION.ZIP; } String suffix = attrs.get("outputSuffix"); + //suffix should not start with "." + if (suffix == null) { + StringBuilder sb = new StringBuilder(); + if (useRecursiveParserWrapper) { + sb.append("json"); + } else if (contentHandlerFactory instanceof BasicContentHandlerFactory) { + appendSuffix(((BasicContentHandlerFactory) contentHandlerFactory).getType(), sb); + } + appendCompression(compression, sb); + suffix = sb.toString(); + } //TODO: possibly open up the different handle-existings in the future //but for now, lock it down to require skip. Too dangerous otherwise @@ -204,4 +220,33 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder { compression, suffix); } + private void appendCompression(FSOutputStreamFactory.COMPRESSION compression, StringBuilder sb) { + switch (compression) { + case NONE: + break; + case ZIP: + sb.append(".zip"); + break; + case BZIP2: + sb.append(".bz2"); + break; + case GZIP: + sb.append(".gz"); + break; + } + } + + private void appendSuffix(BasicContentHandlerFactory.HANDLER_TYPE type, StringBuilder sb) { + switch (type) { + case XML: + sb.append("xml"); + break; + case HTML: + sb.append("html"); + break; + default : + sb.append("txt"); + } + } + } http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml ---------------------------------------------------------------------- diff --git a/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml b/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml index 394c458..1b71152 100644 --- 
a/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml +++ b/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml @@ -26,13 +26,13 @@ <tika-batch-config maxAliveTimeSeconds="-1" pauseOnEarlyTerminationMillis="10000" - timeoutThresholdMillis="300000" - timeoutCheckPulseMillis="1000" - maxQueueSize="10000" - numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 --> - - <!-- options to allow on the commandline --> - <commandline> + timeoutThresholdMillis="300000" + timeoutCheckPulseMillis="1000" + maxQueueSize="10000" + numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 --> + + <!-- options to allow on the commandline --> + <commandline> <option opt="c" longOpt="tika-config" hasArg="true" description="TikaConfig file"/> <option opt="bc" longOpt="batch-config" hasArg="true" @@ -72,14 +72,14 @@ <option opt="timeoutThresholdMillis" hasArg="true" description="how long to wait before determining that a consumer is stale"/> <option opt="includeFilePat" hasArg="true" - description="regex that specifies which files to process"/> - <option opt="excludeFilePat" hasArg="true" - description="regex that specifies which files to avoid processing"/> - <option opt="reporterSleepMillis" hasArg="true" - description="millisecond between reports by the reporter"/> - </commandline> - - + description="regex that specifies which files to process"/> + <option opt="excludeFilePat" hasArg="true" + description="regex that specifies which files to avoid processing"/> + <option opt="reporterSleepMillis" hasArg="true" + description="millisecond between reports by the reporter"/> + </commandline> + + <!-- can specify inputDir="input", but the default config should not include this --> <!-- can also specify startDir="input/someDir" to specify which child directory to start processing --> @@ -116,12 +116,16 @@ 
parseRecursively="true"/> <contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder" basicHandlerType="xml" writeLimit="-1"/> - <!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" --> <!-- can include e.g. outputDir="output", but we don't want to include this in the default! --> - <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/> - </consumers> - - <!-- reporter and interrupter are optional --> - <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" - reporterStaleThresholdMillis="60000"/> - <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> + <!-- can specify custom output file suffix with: + outputSuffix="mysuffix" + if no suffix is specified, BasicTikaFSConsumersBuilder does its best to guess --> + <!-- can specify compression with + compression="bzip2|gzip|zip" --> + <outputstream class="FSOutputStreamFactory" encoding="UTF-8"/> + </consumers> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" + reporterStaleThresholdMillis="60000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> </tika-batch-config> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java ---------------------------------------------------------------------- diff --git a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java index 8cea0b3..d623afb 100644 --- a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java +++ b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java @@ -276,7 +276,6 @@ public class BatchProcessTest extends 
FSBatchTestBase { Paths.get(this.getClass().getResource("/testFileList.txt").toURI()).toString()); args.put("recursiveParserWrapper", "true"); args.put("basicHandlerType", "text"); - args.put("outputSuffix", "json"); BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args, "/tika-batch-config-MockConsumersBuilder.xml"); ex.execute(); Path test1 = outputDir.resolve("test1.xml.json"); @@ -302,7 +301,6 @@ public class BatchProcessTest extends FSBatchTestBase { args.put("numConsumers", "1"); args.put("recursiveParserWrapper", "true"); args.put("basicHandlerType", "text"); - args.put("outputSuffix", "json"); BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args, "/tika-batch-config-MockConsumersBuilder.xml", @@ -312,6 +310,23 @@ public class BatchProcessTest extends FSBatchTestBase { assertContains("parse_ex resourceId=\"test0_bad_chars.xml\"", ss.getOutString()); } + @Test + public void testOverrideOutputSuffix() throws Exception { + Path outputDir = getNewOutputDir("outputSuffixTest"); + + Map<String, String> args = getDefaultArgs("basic", outputDir); + args.put("numConsumers", "1"); + args.put("recursiveParserWrapper", "true"); + args.put("basicHandlerType", "text"); + + BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args, + "/tika-batch-config-test-suffix-override.xml", + "/log4j-on.properties"); + ex.execute(); + Path targ = outputDir.resolve("test0.xml.mysuffix"); + assertTrue(Files.isRegularFile(targ)); + } + private class BatchProcessTestExecutor { private final Map<String, String> args; private final String configPath; http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java ---------------------------------------------------------------------- diff --git a/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java b/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java index d8aecad..6e3648a 100644 --- 
a/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java +++ b/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java @@ -36,7 +36,6 @@ public class HandlerBuilderTest extends FSBatchTestBase { Path outputDir = getNewOutputDir("handler-xml-"); Map<String, String> args = getDefaultArgs("basic", outputDir); args.put("basicHandlerType", "xml"); - args.put("outputSuffix", "xml"); BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args); ParallelFileProcessingResult result = run(runner); @@ -54,7 +53,6 @@ public class HandlerBuilderTest extends FSBatchTestBase { Map<String, String> args = getDefaultArgs("basic", outputDir); args.put("basicHandlerType", "html"); - args.put("outputSuffix", "html"); BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args); ParallelFileProcessingResult result = run(runner); Path outputFile = outputDir.resolve("test0.xml.html"); @@ -70,7 +68,6 @@ public class HandlerBuilderTest extends FSBatchTestBase { Map<String, String> args = getDefaultArgs("basic", outputDir); args.put("basicHandlerType", "txt"); - args.put("outputSuffix", "txt"); BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args); ParallelFileProcessingResult result = run(runner); @@ -105,7 +102,6 @@ public class HandlerBuilderTest extends FSBatchTestBase { Map<String, String> args = getDefaultArgs("basic", outputDir); args.put("basicHandlerType", "txt"); - args.put("outputSuffix", "json"); args.put("recursiveParserWrapper", "true"); BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", args); http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml ---------------------------------------------------------------------- diff --git a/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml b/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml index 
a2915cf..8da44be 100644 --- a/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml +++ b/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml @@ -103,7 +103,7 @@ basicHandlerType="xml" writeLimit="-1"/> <outputstream class="FSOutputStreamFactory" - encoding="UTF-8" outputSuffix="xml"/> + encoding="UTF-8"/> </consumers> <!-- reporter and interrupter are optional --> http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-broken.xml ---------------------------------------------------------------------- diff --git a/tika-batch/src/test/resources/tika-batch-config-broken.xml b/tika-batch/src/test/resources/tika-batch-config-broken.xml index 1d599b4..5b8490e 100644 --- a/tika-batch/src/test/resources/tika-batch-config-broken.xml +++ b/tika-batch/src/test/resources/tika-batch-config-broken.xml @@ -97,7 +97,7 @@ basicHandlerType="xml" writeLimit="-1"/> <outputstream class="FSOutputStreamFactory" - encoding="UTF-8" outputSuffix="xml"/> + encoding="UTF-8"/> </consumers> <!-- reporter and interrupter are optional --> http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml ---------------------------------------------------------------------- diff --git a/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml b/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml new file mode 100644 index 0000000..911398f --- /dev/null +++ b/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml @@ -0,0 +1,112 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<!-- NOTE: tika-batch is still an experimental feature. + The configuration file will likely change and be backward incompatible + with new versions of Tika. Please stay tuned. + --> +<tika-batch-config + maxAliveTimeSeconds="-1" + pauseOnEarlyTerminationMillis="500" + timeoutThresholdMillis="3000" + timeoutCheckPulseMillis="1000" + maxQueueSize="10000" + numConsumers="3"> + <!-- options to allow on the commandline --> + <commandline> + <option opt="c" longOpt="tika-config" hasArg="true" + description="TikaConfig file"/> + <option opt="bc" longOpt="batch-config" hasArg="true" + description="xml batch config file" required="true"/> + <!-- We needed sorted for testing. We added random for performance. + Where crawling a directory is slow, it might be beneficial to + go randomly so that the parsers are triggered earlier. The + default is operating system's choice ("os") which means whatever order + the os returns files in .listFiles(). 
--> + <option opt="crawlOrder" hasArg="true" + description="how does the crawler sort the directories and files: + (random|sorted|os)"/> + <option opt="numConsumers" hasArg="true" + description="number of fileConsumers threads"/> + <option opt="minFileSizeBytes" hasArg="true" + description="minimum file size to process; do not process files smaller than this"/> + <option opt="maxFileSizeBytes" hasArg="true" + description="maximum file size to process; do not process files larger than this"/> + <option opt="maxQueueSize" hasArg="true" + description="maximum queue size for FileResources"/> + <option opt="fileList" hasArg="true" + description="file that contains a list of files (relative to inputDir) to process"/> + <option opt="fileListEncoding" hasArg="true" + description="encoding for fileList"/> + <option opt="inputDir" hasArg="true" + description="root directory for the files to be processed" + required="true"/> + <option opt="startDir" hasArg="true" + description="directory (under inputDir) at which to start crawling"/> + <option opt="outputDir" hasArg="true" + description="output directory" + required="true"/> + <option opt="recursiveParserWrapper" + description="use the RecursiveParserWrapper or not (default = false)"/> + <option opt="handleExisting" hasArg="true" + description="if an output file already exists, do you want to: overwrite, rename or skip"/> + <option opt="basicHandlerType" hasArg="true" + description="what type of content handler: xml, text, html, body"/> + <option opt="outputSuffix" hasArg="true" + description="suffix to add to the end of the output file name"/> + <option opt="timeoutThresholdMillis" hasArg="true" + description="how long to wait before determining that a consumer should be timed out"/> + <option opt="pauseOnEarlyTerminationMillis" hasArg="true" + description="how long to wait for parsers to finish if there is an early termination from the main loop."/> + <!-- in long running process, might be good to restart every hour or so 
to avoid memory leaks--> + <option opt="maxAliveTimeSeconds" hasArg="true" + description="how long should this process run in seconds."/> + </commandline> + <!-- + Can also add startDir: this tells the crawler to start indexing a + child directory of the inputDir directory. + --> + <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" + crawlOrder="sorted" + maxConsecWaitMillis="5000" + maxFilesToAdd="-1" + maxFilesToConsider="-1" + includeFilePat="" + excludeFilePat="" + maxFileSizeBytes="-1" + /> +<!-- inputDir="tika-batch/src/test/resources/test-input" --> + + <consumers builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder" + recursiveParserWrapper="false" consumersManagerMaxMillis="120000"> + <parser builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder" + class="org.apache.tika.parser.mock.MockParserFactory" + parseRecursively="true"/> + <contenthandler builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder" + basicHandlerType="xml" writeLimit="-1"/> + + <outputstream class="FSOutputStreamFactory" + encoding="UTF-8" outputSuffix="mysuffix"/> + </consumers> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" + reporterStaleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> +</tika-batch-config> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-test.xml ---------------------------------------------------------------------- diff --git a/tika-batch/src/test/resources/tika-batch-config-test.xml b/tika-batch/src/test/resources/tika-batch-config-test.xml index cf71fd6..755eb58 100644 --- a/tika-batch/src/test/resources/tika-batch-config-test.xml +++ b/tika-batch/src/test/resources/tika-batch-config-test.xml @@ -102,7 +102,7 @@ 
basicHandlerType="xml" writeLimit="-1"/> <outputstream class="FSOutputStreamFactory" - encoding="UTF-8" outputSuffix="xml"/> + encoding="UTF-8"/> </consumers> <!-- reporter and interrupter are optional --> http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index 810b72e..c611f09 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -153,4 +153,12 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory { } } + /** + * + * @return handler type used by this factory + */ + public HANDLER_TYPE getType() { + return type; + } + }
