Repository: tika
Updated Branches:
  refs/heads/master c94236a83 -> 01109c8fe


TIKA-1918: make outputSuffix optional in tika-batch


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/34db9359
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/34db9359
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/34db9359

Branch: refs/heads/master
Commit: 34db93595c71745e3bccdabc39e72181c03abbbd
Parents: 9ebf066
Author: tballison <[email protected]>
Authored: Thu Mar 31 11:52:27 2016 -0400
Committer: tballison <[email protected]>
Committed: Thu Mar 31 11:52:27 2016 -0400

----------------------------------------------------------------------
 .../tika/cli/BatchCommandLineBuilder.java       |   7 --
 .../main/resources/tika-app-batch-config.xml    |  10 +-
 .../tika/cli/TikaCLIBatchCommandLineTest.java   |   1 -
 .../builders/BasicTikaFSConsumersBuilder.java   |  51 ++++++++-
 .../tika/batch/fs/default-tika-batch-config.xml |  50 +++++----
 .../apache/tika/batch/fs/BatchProcessTest.java  |  19 +++-
 .../tika/batch/fs/HandlerBuilderTest.java       |   4 -
 .../tika-batch-config-MockConsumersBuilder.xml  |   2 +-
 .../test/resources/tika-batch-config-broken.xml |   2 +-
 .../tika-batch-config-test-suffix-override.xml  | 112 +++++++++++++++++++
 .../test/resources/tika-batch-config-test.xml   |   2 +-
 .../tika/sax/BasicContentHandlerFactory.java    |   8 ++
 12 files changed, 222 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
----------------------------------------------------------------------
diff --git 
a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java 
b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
index da44956..2f85546 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
@@ -160,35 +160,28 @@ class BatchCommandLineBuilder {
             map.remove("-h");
             map.remove("--html");
             map.put("-basicHandlerType", "html");
-            map.put("-outputSuffix", "html");
         } else if (map.containsKey("-x") || map.containsKey("--xml")) {
             map.remove("-x");
             map.remove("--xml");
             map.put("-basicHandlerType", "xml");
-            map.put("-outputSuffix", "xml");
         } else if (map.containsKey("-t") || map.containsKey("--text")) {
             map.remove("-t");
             map.remove("--text");
             map.put("-basicHandlerType", "text");
-            map.put("-outputSuffix", "txt");
         } else if (map.containsKey("-m") || map.containsKey("--metadata")) {
             map.remove("-m");
             map.remove("--metadata");
             map.put("-basicHandlerType", "ignore");
-            map.put("-outputSuffix", "json");
         } else if (map.containsKey("-T") || map.containsKey("--text-main")) {
             map.remove("-T");
             map.remove("--text-main");
             map.put("-basicHandlerType", "body");
-            map.put("-outputSuffix", "txt");
         }
 
         if (map.containsKey("-J") || map.containsKey("--jsonRecursive")) {
             map.remove("-J");
             map.remove("--jsonRecursive");
             map.put("-recursiveParserWrapper", "true");
-            //overwrite outputSuffix
-            map.put("-outputSuffix", "json");
         }
 
         if (map.containsKey("--inputDir") || map.containsKey("-i")) {

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/main/resources/tika-app-batch-config.xml
----------------------------------------------------------------------
diff --git a/tika-app/src/main/resources/tika-app-batch-config.xml 
b/tika-app/src/main/resources/tika-app-batch-config.xml
index e2f1204..99651a1 100644
--- a/tika-app/src/main/resources/tika-app-batch-config.xml
+++ b/tika-app/src/main/resources/tika-app-batch-config.xml
@@ -124,9 +124,13 @@
                 digest="md5" digestMarkLimit="1000000"/>
         <contenthandler 
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
                         basicHandlerType="xml" writeLimit="-1"/>
-        <!-- overwritePolicy: "skip" a file if output file exists, "rename" a 
output file, "overwrite" -->
-        <!-- can include e.g. outputDir="output", but we don't want to include 
this in the default! -->
-        <outputstream class="FSOutputStreamFactory" encoding="UTF-8" 
outputSuffix="xml"/>
+        <!-- can specify custom output file suffix with:
+            outputSuffix="mysuffix" (do not include a leading ".")
+            if no outputSuffix is specified, BasicTikaFSConsumersBuilder does its 
best to guess -->
+        <!-- can specify compression with
+            compression="bzip2|gzip|zip" -->
+
+        <outputstream class="FSOutputStreamFactory" encoding="UTF-8"/>
     </consumers>
 
     <!-- reporter and interrupter are optional -->

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
----------------------------------------------------------------------
diff --git 
a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java 
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
index 260273e..e543ccc 100644
--- 
a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
+++ 
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchCommandLineTest.java
@@ -113,7 +113,6 @@ public class TikaCLIBatchCommandLineTest {
         Map<String, String> attrs = mapify(commandLine);
         assertEquals("true", attrs.get("-recursiveParserWrapper"));
         assertEquals("html", attrs.get("-basicHandlerType"));
-        assertEquals("json", attrs.get("-outputSuffix"));
         assertEquals("batch-config.xml", attrs.get("-bc"));
         assertEquals(testInputPathForCommandLine, attrs.get("-inputDir"));
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
----------------------------------------------------------------------
diff --git 
a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
 
b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
index b65b046..4879af4 100644
--- 
a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
+++ 
b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
@@ -41,6 +41,7 @@ import org.apache.tika.batch.fs.FSOutputStreamFactory;
 import org.apache.tika.batch.fs.FSUtil;
 import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.util.ClassLoaderUtil;
 import org.apache.tika.util.PropsUtil;
@@ -125,7 +126,9 @@ public class BasicTikaFSConsumersBuilder extends 
AbstractConsumersBuilder {
         }
         ContentHandlerFactory contentHandlerFactory = 
getContentHandlerFactory(contentHandlerFactoryNode, runtimeAttributes);
         ParserFactory parserFactory = getParserFactory(parserFactoryNode, 
runtimeAttributes);
-        OutputStreamFactory outputStreamFactory = 
getOutputStreamFactory(outputStreamFactoryNode, runtimeAttributes);
+        OutputStreamFactory outputStreamFactory = getOutputStreamFactory(
+                outputStreamFactoryNode, runtimeAttributes,
+                contentHandlerFactory, recursiveParserWrapper);
 
         if (recursiveParserWrapper) {
             for (int i = 0; i < numConsumers; i++) {
@@ -147,7 +150,6 @@ public class BasicTikaFSConsumersBuilder extends 
AbstractConsumersBuilder {
         return manager;
     }
 
-
     private ContentHandlerFactory getContentHandlerFactory(Node node, 
Map<String, String> runtimeAttributes) {
 
         Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, 
runtimeAttributes);
@@ -166,7 +168,10 @@ public class BasicTikaFSConsumersBuilder extends 
AbstractConsumersBuilder {
         return builder.build(node, runtimeAttributes);
     }
 
-    private OutputStreamFactory getOutputStreamFactory(Node node, Map<String, 
String> runtimeAttributes) {
+    private OutputStreamFactory getOutputStreamFactory(Node node,
+                                                       Map<String, String> 
runtimeAttributes,
+                                                       ContentHandlerFactory 
contentHandlerFactory,
+                                                       boolean 
useRecursiveParserWrapper) {
         Map<String, String> attrs = XMLDOMUtil.mapifyAttrs(node, 
runtimeAttributes);
 
         Path outputDir = PropsUtil.getPath(attrs.get("outputDir"), null);
@@ -196,6 +201,17 @@ public class BasicTikaFSConsumersBuilder extends 
AbstractConsumersBuilder {
             compression = FSOutputStreamFactory.COMPRESSION.ZIP;
         }
         String suffix = attrs.get("outputSuffix");
+        //suffix should not start with "."
+        if (suffix == null) {
+            StringBuilder sb = new StringBuilder();
+            if (useRecursiveParserWrapper) {
+                sb.append("json");
+            } else if (contentHandlerFactory instanceof 
BasicContentHandlerFactory) {
+                appendSuffix(((BasicContentHandlerFactory) 
contentHandlerFactory).getType(), sb);
+            }
+            appendCompression(compression, sb);
+            suffix = sb.toString();
+        }
 
         //TODO: possibly open up the different handle-existings in the future
         //but for now, lock it down to require skip.  Too dangerous otherwise
@@ -204,4 +220,33 @@ public class BasicTikaFSConsumersBuilder extends 
AbstractConsumersBuilder {
                 compression, suffix);
     }
 
+    private void appendCompression(FSOutputStreamFactory.COMPRESSION 
compression, StringBuilder sb) {
+        switch (compression) {
+            case NONE:
+                break;
+            case ZIP:
+                sb.append(".zip");
+                break;
+            case BZIP2:
+                sb.append(".bz2");
+                break;
+            case GZIP:
+                sb.append(".gz");
+                break;
+        }
+    }
+
+    private void appendSuffix(BasicContentHandlerFactory.HANDLER_TYPE type, 
StringBuilder sb) {
+        switch (type) {
+            case XML:
+                sb.append("xml");
+                break;
+            case HTML:
+                sb.append("html");
+                break;
+            default :
+                sb.append("txt");
+        }
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
----------------------------------------------------------------------
diff --git 
a/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
 
b/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
index 394c458..1b71152 100644
--- 
a/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
+++ 
b/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml
@@ -26,13 +26,13 @@
 <tika-batch-config
         maxAliveTimeSeconds="-1"
         pauseOnEarlyTerminationMillis="10000"
-        timeoutThresholdMillis="300000"
-        timeoutCheckPulseMillis="1000"
-        maxQueueSize="10000"
-        numConsumers="default"> <!-- numConsumers = number of file consumers, 
"default" = number of processors -1 -->
-
-    <!-- options to allow on the commandline -->
-    <commandline>
+        timeoutThresholdMillis="300000"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="default"> <!-- numConsumers = number of file consumers, 
"default" = number of processors -1 -->
+
+    <!-- options to allow on the commandline -->
+    <commandline>
         <option opt="c" longOpt="tika-config" hasArg="true"
                 description="TikaConfig file"/>
         <option opt="bc" longOpt="batch-config" hasArg="true"
@@ -72,14 +72,14 @@
         <option opt="timeoutThresholdMillis" hasArg="true"
                 description="how long to wait before determining that a 
consumer is stale"/>
         <option opt="includeFilePat" hasArg="true"
-                description="regex that specifies which files to process"/>
-        <option opt="excludeFilePat" hasArg="true"
-                description="regex that specifies which files to avoid 
processing"/>
-        <option opt="reporterSleepMillis" hasArg="true"
-                description="millisecond between reports by the reporter"/>
-    </commandline>
-
-
+                description="regex that specifies which files to process"/>
+        <option opt="excludeFilePat" hasArg="true"
+                description="regex that specifies which files to avoid 
processing"/>
+        <option opt="reporterSleepMillis" hasArg="true"
+                description="millisecond between reports by the reporter"/>
+    </commandline>
+
+
     <!-- can specify inputDir="input", but the default config should not 
include this -->
     <!-- can also specify startDir="input/someDir" to specify which child 
directory
          to start processing -->
@@ -116,12 +116,16 @@
                 parseRecursively="true"/>
         <contenthandler 
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
                         basicHandlerType="xml" writeLimit="-1"/>
-        <!-- overwritePolicy: "skip" a file if output file exists, "rename" a 
output file, "overwrite" -->        <!-- can include e.g. outputDir="output", 
but we don't want to include this in the default! -->
-        <outputstream class="FSOutputStreamFactory" encoding="UTF-8" 
outputSuffix="xml"/>
-    </consumers>
-
-    <!-- reporter and interrupter are optional -->
-    <reporter 
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" 
reporterSleepMillis="1000"
-              reporterStaleThresholdMillis="60000"/>
-    <interrupter 
builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+        <!-- can specify custom output file suffix with:
+            outputSuffix="mysuffix" (do not include a leading ".")
+            if no outputSuffix is specified, BasicTikaFSConsumersBuilder does its 
best to guess -->
+        <!-- can specify compression with
+            compression="bzip2|gzip|zip" -->
+        <outputstream class="FSOutputStreamFactory" encoding="UTF-8"/>
+    </consumers>
+
+    <!-- reporter and interrupter are optional -->
+    <reporter 
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" 
reporterSleepMillis="1000"
+              reporterStaleThresholdMillis="60000"/>
+    <interrupter 
builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
 </tika-batch-config>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
----------------------------------------------------------------------
diff --git 
a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java 
b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
index 8cea0b3..d623afb 100644
--- a/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
+++ b/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
@@ -276,7 +276,6 @@ public class BatchProcessTest extends FSBatchTestBase {
                 
Paths.get(this.getClass().getResource("/testFileList.txt").toURI()).toString());
         args.put("recursiveParserWrapper", "true");
         args.put("basicHandlerType", "text");
-        args.put("outputSuffix", "json");
         BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args, 
"/tika-batch-config-MockConsumersBuilder.xml");
         ex.execute();
         Path test1 = outputDir.resolve("test1.xml.json");
@@ -302,7 +301,6 @@ public class BatchProcessTest extends FSBatchTestBase {
         args.put("numConsumers", "1");
         args.put("recursiveParserWrapper", "true");
         args.put("basicHandlerType", "text");
-        args.put("outputSuffix", "json");
 
         BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args,
                 "/tika-batch-config-MockConsumersBuilder.xml",
@@ -312,6 +310,23 @@ public class BatchProcessTest extends FSBatchTestBase {
         assertContains("parse_ex resourceId=\"test0_bad_chars.xml\"", 
ss.getOutString());
     }
 
+    @Test
+    public void testOverrideOutputSuffix() throws Exception {
+        Path outputDir = getNewOutputDir("outputSuffixTest");
+
+        Map<String, String> args = getDefaultArgs("basic", outputDir);
+        args.put("numConsumers", "1");
+        args.put("recursiveParserWrapper", "true");
+        args.put("basicHandlerType", "text");
+
+        BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args,
+                "/tika-batch-config-test-suffix-override.xml",
+                "/log4j-on.properties");
+        ex.execute();
+        Path targ = outputDir.resolve("test0.xml.mysuffix");
+        assertTrue(Files.isRegularFile(targ));
+    }
+
     private class BatchProcessTestExecutor {
         private final Map<String, String> args;
         private final String configPath;

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
----------------------------------------------------------------------
diff --git 
a/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java 
b/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
index d8aecad..6e3648a 100644
--- a/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
+++ b/tika-batch/src/test/java/org/apache/tika/batch/fs/HandlerBuilderTest.java
@@ -36,7 +36,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
         Path outputDir = getNewOutputDir("handler-xml-");
         Map<String, String> args = getDefaultArgs("basic", outputDir);
         args.put("basicHandlerType", "xml");
-        args.put("outputSuffix", "xml");
 
         BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", 
args);
         ParallelFileProcessingResult result = run(runner);
@@ -54,7 +53,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
 
         Map<String, String> args = getDefaultArgs("basic", outputDir);
         args.put("basicHandlerType", "html");
-        args.put("outputSuffix", "html");
         BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", 
args);
         ParallelFileProcessingResult result = run(runner);
         Path outputFile = outputDir.resolve("test0.xml.html");
@@ -70,7 +68,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
 
         Map<String, String> args = getDefaultArgs("basic", outputDir);
         args.put("basicHandlerType", "txt");
-        args.put("outputSuffix", "txt");
 
         BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", 
args);
         ParallelFileProcessingResult result = run(runner);
@@ -105,7 +102,6 @@ public class HandlerBuilderTest extends FSBatchTestBase {
 
         Map<String, String> args = getDefaultArgs("basic", outputDir);
         args.put("basicHandlerType", "txt");
-        args.put("outputSuffix", "json");
         args.put("recursiveParserWrapper", "true");
 
         BatchProcess runner = getNewBatchRunner("/tika-batch-config-test.xml", 
args);

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
----------------------------------------------------------------------
diff --git 
a/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml 
b/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
index a2915cf..8da44be 100644
--- a/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
+++ b/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml
@@ -103,7 +103,7 @@
                         basicHandlerType="xml" writeLimit="-1"/>
 
                <outputstream class="FSOutputStreamFactory"
-                encoding="UTF-8" outputSuffix="xml"/>
+                encoding="UTF-8"/>
        </consumers>
        
        <!-- reporter and interrupter are optional -->

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-broken.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/resources/tika-batch-config-broken.xml 
b/tika-batch/src/test/resources/tika-batch-config-broken.xml
index 1d599b4..5b8490e 100644
--- a/tika-batch/src/test/resources/tika-batch-config-broken.xml
+++ b/tika-batch/src/test/resources/tika-batch-config-broken.xml
@@ -97,7 +97,7 @@
                         basicHandlerType="xml" writeLimit="-1"/>
 
                <outputstream class="FSOutputStreamFactory"
-                encoding="UTF-8" outputSuffix="xml"/>
+                encoding="UTF-8"/>
        </consumers>
        
        <!-- reporter and interrupter are optional -->

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml
----------------------------------------------------------------------
diff --git 
a/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml 
b/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml
new file mode 100644
index 0000000..911398f
--- /dev/null
+++ b/tika-batch/src/test/resources/tika-batch-config-test-suffix-override.xml
@@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<!-- NOTE: tika-batch is still an experimental feature.
+    The configuration file will likely change and be backward incompatible
+    with new versions of Tika.  Please stay tuned.
+    -->
+<tika-batch-config
+        maxAliveTimeSeconds="-1"
+        pauseOnEarlyTerminationMillis="500"
+        timeoutThresholdMillis="3000"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="3">
+    <!-- options to allow on the commandline -->
+    <commandline>
+        <option opt="c" longOpt="tika-config" hasArg="true"
+                description="TikaConfig file"/>
+        <option opt="bc" longOpt="batch-config" hasArg="true"
+                description="xml batch config file" required="true"/>
+        <!-- We needed sorted for testing.  We added random for performance.
+             Where crawling a directory is slow, it might be beneficial to
+             go randomly so that the parsers are triggered earlier.  The
+             default is operating system's choice ("os") which means whatever 
order
+             the os returns files in .listFiles(). -->
+        <option opt="crawlOrder" hasArg="true"
+                description="how does the crawler sort the directories and 
files:
+                                (random|sorted|os)"/>
+        <option opt="numConsumers" hasArg="true"
+                description="number of fileConsumers threads"/>
+        <option opt="minFileSizeBytes" hasArg="true"
+                description="minimum file size to process; do not process 
files smaller than this"/>
+        <option opt="maxFileSizeBytes" hasArg="true"
+                description="maximum file size to process; do not process 
files larger than this"/>
+        <option opt="maxQueueSize" hasArg="true"
+                description="maximum queue size for FileResources"/>
+        <option opt="fileList" hasArg="true"
+                description="file that contains a list of files (relative to 
inputDir) to process"/>
+        <option opt="fileListEncoding" hasArg="true"
+                description="encoding for fileList"/>
+        <option opt="inputDir" hasArg="true"
+                description="root directory for the files to be processed"
+                required="true"/>
+        <option opt="startDir" hasArg="true"
+                description="directory (under inputDir) at which to start 
crawling"/>
+        <option opt="outputDir" hasArg="true"
+                description="output directory"
+                required="true"/>
+        <option opt="recursiveParserWrapper"
+                description="use the RecursiveParserWrapper or not (default = 
false)"/>
+        <option opt="handleExisting" hasArg="true"
+                description="if an output file already exists, do you want to: 
overwrite, rename or skip"/>
+        <option opt="basicHandlerType" hasArg="true"
+                description="what type of content handler: xml, text, html, 
body"/>
+        <option opt="outputSuffix" hasArg="true"
+                description="suffix to add to the end of the output file 
name"/>
+        <option opt="timeoutThresholdMillis" hasArg="true"
+                description="how long to wait before determining that a 
consumer should be timed out"/>
+        <option opt="pauseOnEarlyTerminationMillis" hasArg="true"
+                description="how long to wait for parsers to finish if there 
is an early termination from the main loop."/>
+        <!-- in long running process, might be good to restart every hour or 
so to avoid memory leaks-->
+        <option opt="maxAliveTimeSeconds" hasArg="true"
+                description="how long should this process run in seconds."/>
+    </commandline>
+    <!--
+        Can also add startDir: this tells the crawler to start indexing a
+        child directory of the inputDir directory.
+    -->
+       <crawler 
builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+        crawlOrder="sorted"
+        maxConsecWaitMillis="5000"
+        maxFilesToAdd="-1"
+               maxFilesToConsider="-1" 
+               includeFilePat=""
+               excludeFilePat=""
+               maxFileSizeBytes="-1"
+        />
+<!--        inputDir="tika-batch/src/test/resources/test-input" -->
+
+       <consumers 
builderClass="org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder"
+               recursiveParserWrapper="false" 
consumersManagerMaxMillis="120000">
+        <parser 
builderClass="org.apache.tika.batch.builders.ParserFactoryBuilder"
+                class="org.apache.tika.parser.mock.MockParserFactory"
+                parseRecursively="true"/>
+               <contenthandler 
builderClass="org.apache.tika.batch.builders.DefaultContentHandlerFactoryBuilder"
+                        basicHandlerType="xml" writeLimit="-1"/>
+
+               <outputstream class="FSOutputStreamFactory"
+                encoding="UTF-8" outputSuffix="mysuffix"/>
+       </consumers>
+       
+       <!-- reporter and interrupter are optional -->
+       <reporter 
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" 
reporterSleepMillis="1000"
+              reporterStaleThresholdMillis="500000"/>
+       <interrupter 
builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-batch/src/test/resources/tika-batch-config-test.xml
----------------------------------------------------------------------
diff --git a/tika-batch/src/test/resources/tika-batch-config-test.xml 
b/tika-batch/src/test/resources/tika-batch-config-test.xml
index cf71fd6..755eb58 100644
--- a/tika-batch/src/test/resources/tika-batch-config-test.xml
+++ b/tika-batch/src/test/resources/tika-batch-config-test.xml
@@ -102,7 +102,7 @@
                         basicHandlerType="xml" writeLimit="-1"/>
 
                <outputstream class="FSOutputStreamFactory"
-                encoding="UTF-8" outputSuffix="xml"/>
+                encoding="UTF-8"/>
        </consumers>
        
        <!-- reporter and interrupter are optional -->

http://git-wip-us.apache.org/repos/asf/tika/blob/34db9359/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java 
b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 810b72e..c611f09 100644
--- 
a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ 
b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -153,4 +153,12 @@ public class BasicContentHandlerFactory implements 
ContentHandlerFactory {
         }
     }
 
+    /**
+     *
+     * @return handler type used by this factory
+     */
+    public HANDLER_TYPE getType() {
+        return type;
+    }
+
 }

Reply via email to