Author: tallison
Date: Wed Apr 1 18:27:23 2015
New Revision: 1670749
URL: http://svn.apache.org/r1670749
Log:
TIKA-1330 clean up logging in tika-batch and tika-app integration of tika-batch
Added:
tika/trunk/tika-app/src/main/resources/log4j_batch_process.properties
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParallelFileProcessingResult.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/AbstractFSConsumer.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
tika/trunk/tika-batch/src/test/resources/log4j.properties
tika/trunk/tika-batch/src/test/resources/log4j_process.properties
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
---
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
(original)
+++
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java
Wed Apr 1 18:27:23 2015
@@ -39,13 +39,13 @@ class BatchCommandLineBuilder {
static Pattern JVM_OPTS_PATTERN = Pattern.compile("^(--?)J(.+)");
protected static String[] build(String[] args) throws IOException {
- Map<String, String> processArgs = new LinkedHashMap<String, String>();
- Map<String, String> jvmOpts = new LinkedHashMap<String,String>();
- //take the args, and divide them into process args and options for
- //the parent jvm process (i.e. log files, etc)
- mapifyArgs(args, processArgs, jvmOpts);
-
- //now modify processArgs in place
+ Map<String, String> processArgs = new LinkedHashMap<String, String>();
+ Map<String, String> jvmOpts = new LinkedHashMap<String,String>();
+ //take the args, and divide them into process args and options for
+ //the child jvm process (i.e. log files, etc)
+ mapifyArgs(args, processArgs, jvmOpts);
+
+ //now modify processArgs in place
translateCommandLine(args, processArgs);
//maybe the user specified a different classpath?!
@@ -56,12 +56,23 @@ class BatchCommandLineBuilder {
if (cp.contains(" ")){
cp = "\""+cp+"\"";
}
- jvmOpts.put("-cp", cp);
- }
-
- //now build the full command line
- List<String> fullCommand = new ArrayList<String>();
- fullCommand.add("java");
+ jvmOpts.put("-cp", cp);
+ }
+
+ boolean hasLog4j = false;
+ for (String k : jvmOpts.keySet()) {
+ if (k.startsWith("-Dlog4j.configuration=")) {
+ hasLog4j = true;
+ break;
+ }
+ }
+ //use the log4j config file inside the app
/resources/log4j_batch_process.properties
+ if (! hasLog4j) {
+
jvmOpts.put("-Dlog4j.configuration=\"log4j_batch_process.properties\"", "");
+ }
+ //now build the full command line
+ List<String> fullCommand = new ArrayList<String>();
+ fullCommand.add("java");
for (Map.Entry<String, String> e : jvmOpts.entrySet()) {
fullCommand.add(e.getKey());
if (e.getValue().length() > 0) {
@@ -79,16 +90,16 @@ class BatchCommandLineBuilder {
return fullCommand.toArray(new String[fullCommand.size()]);
}
-
- /**
- * Take the input args and separate them into args that belong on the
commandline
- * and those that belong as jvm args for the parent process.
- * @param args -- literal args from TikaCLI commandline
- * @param commandLine args that should be part of the batch commandline
- * @param jvmArgs args that belong as jvm arguments for the parent process
- */
- private static void mapifyArgs(final String[] args,
- final Map<String, String> commandLine,
+
+ /**
+ * Take the input args and separate them into args that belong on the
commandline
+ * and those that belong as jvm args for the child process.
+ * @param args -- literal args from TikaCLI commandline
+ * @param commandLine args that should be part of the batch commandline
+ * @param jvmArgs args that belong as jvm arguments for the child process
+ */
+ private static void mapifyArgs(final String[] args,
+ final Map<String, String> commandLine,
final Map<String, String> jvmArgs) {
if (args.length == 0) {
@@ -187,8 +198,9 @@ class BatchCommandLineBuilder {
if (map.containsKey("--outputDir") || map.containsKey("-o")) {
String v1 = map.remove("--outputDir");
String v2 = map.remove("-o");
- String v = (v1 == null) ? v2 : v1;
- map.put("-outputDir", v);
- }
- }
-}
+ String v = (v1 == null) ? v2 : v1;
+ map.put("-outputDir", v);
+ }
+
+ }
+}
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Apr
1 18:27:23 2015
@@ -21,7 +21,6 @@ import javax.xml.transform.TransformerCo
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
-
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
@@ -65,7 +64,6 @@ import org.apache.poi.poifs.filesystem.D
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.Tika;
import org.apache.tika.batch.BatchProcessDriverCLI;
-import org.apache.tika.batch.fs.FSBatchProcessCLI;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.DefaultDetector;
@@ -115,10 +113,17 @@ public class TikaCLI {
private static final Log logger = LogFactory.getLog(TikaCLI.class);
public static void main(String[] args) throws Exception {
+
+ String log4jFile = System.getProperty("log4j.configuration");
+ if (log4jFile == null || log4jFile.trim().length()==0) {
+ BasicConfigurator.configure(
+ new WriterAppender(new SimpleLayout(), System.err));
+ Logger.getRootLogger().setLevel(Level.INFO);
+ }
+
TikaCLI cli = new TikaCLI();
if (cli.testForHelp(args)) {
- FSBatchProcessCLI batchProcessCLI = new FSBatchProcessCLI(args);
cli.usage();
return;
} else if (cli.testForBatch(args)) {
@@ -128,10 +133,6 @@ public class TikaCLI {
return;
}
- BasicConfigurator.configure(
- new WriterAppender(new SimpleLayout(), System.err));
- Logger.getRootLogger().setLevel(Level.INFO);
-
if (args.length > 0) {
for (int i = 0; i < args.length; i++) {
cli.process(args[i]);
@@ -587,7 +588,7 @@ public class TikaCLI {
out.println();
out.println(" Simplest method.");
out.println(" Specify two directories as args with no other args:");
- out.println(" java -jar tika-app.jar <inputDirectory>
<outputDirectory");
+ out.println(" java -jar tika-app.jar <inputDirectory>
<outputDirectory>");
out.println();
out.println("Batch Options:");
out.println(" -i or --inputDir Input directory");
@@ -610,7 +611,6 @@ public class TikaCLI {
out.println();
out.println(" To modify child process jvm args, prepend \"J\" as
in:");
out.println(" -JXmx4g or -JDlog4j.configuration=file:log4j.xml.");
-
}
private void version() {
Added: tika/trunk/tika-app/src/main/resources/log4j_batch_process.properties
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/resources/log4j_batch_process.properties?rev=1670749&view=auto
==============================================================================
--- tika/trunk/tika-app/src/main/resources/log4j_batch_process.properties
(added)
+++ tika/trunk/tika-app/src/main/resources/log4j_batch_process.properties Wed
Apr 1 18:27:23 2015
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#info,debug, error,fatal ...
+log4j.rootLogger=info,stdout
+
+#console
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+
+
+log4j.appender.stdout.layout.ConversionPattern=%m%n
Added:
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java?rev=1670749&view=auto
==============================================================================
---
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
(added)
+++
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java
Wed Apr 1 18:27:23 2015
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.cli;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.io.Reader;
+import java.util.List;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TikaCLIBatchIntegrationTest {
+
+ private File testDataFile = new File("src/test/resources/test-data");
+
+ private File tempDir;
+ private OutputStream out = null;
+ private OutputStream err = null;
+ private ByteArrayOutputStream outBuffer = null;
+
+ @Before
+ public void setup() throws Exception {
+ tempDir = File.createTempFile("tika-cli-test-batch-", "");
+ tempDir.delete();
+ tempDir.mkdir();
+ outBuffer = new ByteArrayOutputStream();
+ PrintStream outWriter = new PrintStream(outBuffer, true,
IOUtils.UTF_8.name());
+ ByteArrayOutputStream errBuffer = new ByteArrayOutputStream();
+ PrintStream errWriter = new PrintStream(errBuffer, true,
IOUtils.UTF_8.name());
+ out = System.out;
+ err = System.err;
+ System.setOut(outWriter);
+ System.setErr(errWriter);
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ System.setOut(new PrintStream(out, true, IOUtils.UTF_8.name()));
+ System.setErr(new PrintStream(err, true, IOUtils.UTF_8.name()));
+ FileUtils.deleteDirectory(tempDir);
+ }
+
+ @Test
+ public void testSimplestBatchIntegration() throws Exception {
+ String[] params = {escape(testDataFile.getAbsolutePath()),
+ escape(tempDir.getAbsolutePath())};
+ TikaCLI.main(params);
+
+ assertTrue("bad_xml.xml.xml", new File(tempDir,
"bad_xml.xml.xml").isFile());
+ assertTrue("coffee.xls.xml", new File(tempDir,
"coffee.xls.xml").exists());
+ }
+
+ @Test
+ public void testBasicBatchIntegration() throws Exception {
+ String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+ "-o", escape(tempDir.getAbsolutePath()),
+ "-numConsumers", "2"
+ };
+ TikaCLI.main(params);
+
+ assertTrue("bad_xml.xml.xml", new File(tempDir,
"bad_xml.xml.xml").isFile());
+ assertTrue("coffee.xls.xml", new File(tempDir,
"coffee.xls.xml").exists());
+ }
+
+ @Test
+ public void testJsonRecursiveBatchIntegration() throws Exception {
+ Reader reader = null;
+ try {
+ String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+ "-o", escape(tempDir.getAbsolutePath()),
+ "-numConsumers", "10",
+ "-J", //recursive Json
+ "-t" //plain text in content
+ };
+ TikaCLI.main(params);
+ reader = new InputStreamReader(
+ new FileInputStream(new File(tempDir,
"test_recursive_embedded.docx.json")), IOUtils.UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+
assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human
events"));
+ } finally {
+ IOUtils.closeQuietly(reader);
+ }
+ }
+
+ @Test
+ public void testProcessLogFileConfig() throws Exception {
+ String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
+ "-o", escape(tempDir.getAbsolutePath()),
+ "-numConsumers", "2",
+ "-JDlog4j.configuration=log4j_batch_process_test.properties"};
+ TikaCLI.main(params);
+
+ assertTrue("bad_xml.xml.xml", new File(tempDir,
"bad_xml.xml.xml").isFile());
+ assertTrue("coffee.xls.xml", new File(tempDir,
"coffee.xls.xml").exists());
+ String sysOutString = new String(outBuffer.toByteArray(),
IOUtils.UTF_8);
+ assertTrue(sysOutString.contains("MY_CUSTOM_LOG_CONFIG"));
+ }
+
+ public static String escape(String path) {
+ if (path.indexOf(' ') > -1) {
+ return '"' + path + '"';
+ }
+ return path;
+ }
+
+}
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
(original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Wed
Apr 1 18:27:23 2015
@@ -16,26 +16,17 @@
*/
package org.apache.tika.cli;
-import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayOutputStream;
import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
import java.io.PrintStream;
-import java.io.Reader;
import java.net.URI;
-import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.serialization.JsonMetadataList;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -387,97 +378,4 @@ public class TikaCLITest {
assertTrue(content.contains("\\n\\nembed_4\\n"));
assertTrue(content.contains("\\n\\nembed_0"));
}
-
- @Test
- public void testSimplestBatchIntegration() throws Exception {
- File tempDir = File.createTempFile("tika-cli-test-batch-", "");
- tempDir.delete();
- tempDir.mkdir();
- ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
- PrintStream writer = new PrintStream(outBuffer, true,
IOUtils.UTF_8.name());
- OutputStream os = System.out;
- System.setOut(writer);
- try {
- String[] params = {escape(testDataFile.getAbsolutePath()),
- escape(tempDir.getAbsolutePath())};
- TikaCLI.main(params);
-
- StringBuffer allFiles = new StringBuffer();
- assertTrue("bad_xml.xml.xml", new File(tempDir,
"bad_xml.xml.xml").isFile());
- assertTrue("coffee.xls.xml", new File(tempDir,
"coffee.xls.xml").exists());
- } finally {
- //reset in case something went horribly wrong
- System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name()));
- FileUtils.deleteDirectory(tempDir);
- }
- }
-
- @Test
- public void testBasicBatchIntegration() throws Exception {
- File tempDir = File.createTempFile("tika-cli-test-batch-", "");
- tempDir.delete();
- tempDir.mkdir();
- ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
- PrintStream writer = new PrintStream(outBuffer, true,
IOUtils.UTF_8.name());
- OutputStream os = System.out;
- System.setOut(writer);
- try {
- String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
- "-o", escape(tempDir.getAbsolutePath()),
- "-numConsumers", "2",
- "-reporterSleepMillis", "100"};//report often to make sure
- TikaCLI.main(params);
-
- StringBuffer allFiles = new StringBuffer();
- assertTrue("bad_xml.xml.xml", new File(tempDir,
"bad_xml.xml.xml").isFile());
- assertTrue("coffee.xls.xml", new File(tempDir,
"coffee.xls.xml").exists());
- String sysOutString = new String(outBuffer.toByteArray(),
IOUtils.UTF_8);
-
- assertEquals(-1, sysOutString.indexOf("There are 3 file processors
still active"));
- assertTrue(sysOutString.indexOf("There are 2 file processors") >
-1);
- } finally {
- //reset in case something went horribly wrong
- System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name()));
- FileUtils.deleteDirectory(tempDir);
- }
- }
-
- @Test
- public void testJsonRecursiveBatchIntegration() throws Exception {
- File tempDir = File.createTempFile("tika-cli-test-batch-", "");
- tempDir.delete();
- tempDir.mkdir();
- ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
- PrintStream writer = new PrintStream(outBuffer, true,
IOUtils.UTF_8.name());
- OutputStream os = System.out;
- System.setOut(writer);
- Reader reader = null;
- try {
- String[] params = {"-i", escape(testDataFile.getAbsolutePath()),
- "-o", escape(tempDir.getAbsolutePath()),
- "-numConsumers", "10",
- "-J", //recursive Json
- "-t" //plain text in content
- };
- TikaCLI.main(params);
- reader = new InputStreamReader(
- new FileInputStream(new File(tempDir,
"test_recursive_embedded.docx.json")), IOUtils.UTF_8);
- List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
- assertEquals(12, metadataList.size());
-
assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human
events"));
- } finally {
- IOUtils.closeQuietly(reader);
- //reset in case something went horribly wrong
- System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name()));
- FileUtils.deleteDirectory(tempDir);
- }
- }
-
-
- public static String escape(String path) {
- if (path.indexOf(' ') > -1){
- return '"'+path+'"';
- }
- return path;
- }
}
Added:
tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties?rev=1670749&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties
(added)
+++ tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties
Wed Apr 1 18:27:23 2015
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#info,debug, error,fatal ...
+log4j.rootLogger=info,stdout
+
+#console
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+
+
+log4j.appender.stdout.layout.ConversionPattern=MY_CUSTOM_LOG_CONFIG %m%n
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java
(original)
+++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java
Wed Apr 1 18:27:23 2015
@@ -127,15 +127,17 @@ public class BatchProcess implements Cal
*/
public ParallelFileProcessingResult call()
throws InterruptedException {
- if (alreadyExecuted) {
- throw new IllegalStateException("Can only execute BatchRunner
once.");
- }
- //redirect streams
- try {
- outputStreamWriter = new PrintStream(System.err, true,
IOUtils.UTF_8.toString());
- } catch (IOException e) {
- throw new RuntimeException("Can't redirect streams");
- }
+ if (alreadyExecuted) {
+ throw new IllegalStateException("Can only execute BatchRunner
once.");
+ }
+ //redirect streams; all organic warnings should go to System.err;
+ //System.err should be redirected to System.out
+ PrintStream sysErr = System.err;
+ try {
+ outputStreamWriter = new PrintStream(sysErr, true,
IOUtils.UTF_8.toString());
+ } catch (IOException e) {
+ throw new RuntimeException("Can't redirect streams");
+ }
System.setErr(System.out);
ParallelFileProcessingResult result = null;
@@ -152,13 +154,13 @@ public class BatchProcess implements Cal
TimeoutChecker timeoutChecker = new TimeoutChecker();
try {
- startConsumersManager();
- } catch (BatchNoRestartError e) {
- return new
- ParallelFileProcessingResult(0, 0, 0,
- 0, BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE,
-
CAUSE_FOR_TERMINATION.CONSUMERS_MANAGER_DIDNT_INIT_IN_TIME_NO_RESTART.toString());
-
+ startConsumersManager();
+ } catch (BatchNoRestartError e) {
+ return new
+ ParallelFileProcessingResult(0, 0, 0, 0,
+ 0, BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE,
+
CAUSE_FOR_TERMINATION.CONSUMERS_MANAGER_DIDNT_INIT_IN_TIME_NO_RESTART.toString());
+
}
State state = mainLoop(completionService, timeoutChecker);
@@ -198,13 +200,12 @@ public class BatchProcess implements Cal
if (futureResult != null) {
state.removed++;
- IFileProcessorFutureResult result = futureResult.get();
- if (result instanceof FileConsumerFutureResult) {
- state.consumersRemoved++;
- state.processed += ((FileConsumerFutureResult)
result).getFilesProcessed();
- } else if (result instanceof
FileResourceCrawlerFutureResult) {
- state.crawlersRemoved++;
- if (fileResourceCrawler.wasTimedOut()) {
+ IFileProcessorFutureResult result = futureResult.get();
+ if (result instanceof FileConsumerFutureResult) {
+ state.consumersRemoved++;
+ } else if (result instanceof
FileResourceCrawlerFutureResult) {
+ state.crawlersRemoved++;
+ if (fileResourceCrawler.wasTimedOut()) {
causeForTermination =
CAUSE_FOR_TERMINATION.CRAWLER_TIMED_OUT;
break;
}
@@ -290,13 +291,12 @@ public class BatchProcess implements Cal
break;
}
try {
- IFileProcessorFutureResult result = future.get();
- if (result instanceof FileConsumerFutureResult) {
- FileConsumerFutureResult consumerResult =
(FileConsumerFutureResult) result;
- state.processed += consumerResult.getFilesProcessed();
- FileStarted fileStarted = consumerResult.getFileStarted();
- if (fileStarted != null
- && fileStarted.getElapsedMillis() >
timeoutThresholdMillis) {
+ IFileProcessorFutureResult result = future.get();
+ if (result instanceof FileConsumerFutureResult) {
+ FileConsumerFutureResult consumerResult =
(FileConsumerFutureResult) result;
+ FileStarted fileStarted = consumerResult.getFileStarted();
+ if (fileStarted != null
+ && fileStarted.getElapsedMillis() >
timeoutThresholdMillis) {
logger.warn(fileStarted.getResourceId()
+ "\t caused a file processor to hang or
crash. You may need to remove "
+ "this file from your input set and rerun.");
@@ -345,18 +345,23 @@ public class BatchProcess implements Cal
"< for " + fs.getElapsedMillis() + " milliseconds after it
started." +
" This exceeds the maxTimeoutMillis parameter");
}
- double elapsed = ((double) new Date().getTime() - (double)
state.start) / 1000.0;
- return new
- ParallelFileProcessingResult(considered, added, state.processed,
- elapsed, exitStatus, state.causeForTermination.toString());
- }
-
- private class State {
- long start = -1;
- int processed = 0;
- int numConsumers = 0;
- int numNonConsumers = 0;
- int removed = 0;
+ double elapsed = ((double) new Date().getTime() - (double)
state.start) / 1000.0;
+ int processed = 0;
+ int numExceptions = 0;
+ for (FileResourceConsumer c : consumersManager.getConsumers()) {
+ processed += c.getNumResourcesConsumed();
+ numExceptions += c.getNumHandledExceptions();
+ }
+ return new
+ ParallelFileProcessingResult(considered, added, processed,
numExceptions,
+ elapsed, exitStatus, state.causeForTermination.toString());
+ }
+
+ private class State {
+ long start = -1;
+ int numConsumers = 0;
+ int numNonConsumers = 0;
+ int removed = 0;
int consumersRemoved = 0;
int crawlersRemoved = 0;
CAUSE_FOR_TERMINATION causeForTermination = null;
@@ -574,12 +579,14 @@ public class BatchProcess implements Cal
}
}
}
- }
-
- private class TimeoutFutureResult implements IFileProcessorFutureResult {
- private final int timedOutCount;
-
- private TimeoutFutureResult(final int timedOutCount) {
+ }
+
+ private class TimeoutFutureResult implements IFileProcessorFutureResult {
+ //used to be used when more than one timeout was allowed
+ //TODO: get rid of this?
+ private final int timedOutCount;
+
+ private TimeoutFutureResult(final int timedOutCount) {
this.timedOutCount = timedOutCount;
}
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java
Wed Apr 1 18:27:23 2015
@@ -102,21 +102,21 @@ public class BatchProcessDriverCLI {
}
public void execute() throws Exception {
-
- interruptWatcherThread.setDaemon(true);
- interruptWatcherThread.start();
- logger.trace("about to start");
- start();
- int loopsAfterRestartMessageReceived = 0;
- while (!userInterrupted) {
+
+ interruptWatcherThread.setDaemon(true);
+ interruptWatcherThread.start();
+ logger.info("about to start driver");
+ start();
+ int loopsAfterRestartMessageReceived = 0;
+ while (!userInterrupted) {
Integer exit = null;
- try {
- logger.trace("about to check exit value");
- exit = process.exitValue();
- logger.trace("exit value:" + exit);
- stop();
- } catch (IllegalThreadStateException e) {
- //hasn't exited
+ try {
+ logger.trace("about to check exit value");
+ exit = process.exitValue();
+ logger.info("The child process has finished with an exit value
of: "+exit);
+ stop();
+ } catch (IllegalThreadStateException e) {
+ //hasn't exited
logger.trace("process has not exited;
IllegalThreadStateException");
}
@@ -135,13 +135,13 @@ public class BatchProcessDriverCLI {
" exit=" + exit + " receivedRestartMsg=" +
receivedRestartMsg);
//if we've gotten the message via stdout to restart
//but the process hasn't exited yet, give it another
- //chance
- if (receivedRestartMsg && exit == null) {
- loopsAfterRestartMessageReceived++;
- logger.trace("Must restart, still not exited; loops after
restart: " +
- loopsAfterRestartMessageReceived);
- continue;
- }
+ //chance
+ if (receivedRestartMsg && exit == null) {
+ loopsAfterRestartMessageReceived++;
+ logger.warn("Must restart, still not exited; loops after
restart: " +
+ loopsAfterRestartMessageReceived);
+ continue;
+ }
if (loopsAfterRestartMessageReceived >
waitNumLoopsAfterRestartmessage) {
logger.trace("About to try to restart because:" +
" exit=" + exit + " receivedRestartMsg=" +
receivedRestartMsg);
@@ -153,13 +153,13 @@ public class BatchProcessDriverCLI {
}
} else if (exit != null && exit !=
BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE
&& exit !=
BatchProcessDriverCLI.PROCESS_COMPLETED_SUCCESSFULLY) {
- logger.trace("About to try to restart because:" +
- " exit=" + exit + " receivedRestartMsg=" +
receivedRestartMsg);
-
- if (exit != null && exit ==
BatchProcessDriverCLI.PROCESS_RESTART_EXIT_CODE) {
- logger.info("Restarting on expected restart code");
- } else {
- logger.warn("Restarting on unexpected restart code:
"+exit);
+ logger.trace("About to try to restart because:" +
+ " exit=" + exit + " receivedRestartMsg=" +
receivedRestartMsg);
+
+ if (exit == BatchProcessDriverCLI.PROCESS_RESTART_EXIT_CODE) {
+ logger.info("Restarting on expected restart code");
+ } else {
+ logger.warn("Restarting on unexpected restart code:
"+exit);
}
boolean restarted = restart(exit, receivedRestartMsg);
if (!restarted) {
@@ -170,17 +170,18 @@ public class BatchProcessDriverCLI {
logger.trace("Will not restart: "+exit);
break;
}
- }
- logger.trace("about to call shutdown driver now");
- shutdownDriverNow();
- }
-
- private void shutdownDriverNow() {
- if (process != null) {
- for (int i = 0; i < 10; i++) {
-
- logger.trace("trying to shut down: "+i);
- try {
+ }
+ logger.trace("about to call shutdown driver now");
+ shutdownDriverNow();
+ logger.info("Process driver has completed");
+ }
+
+ private void shutdownDriverNow() {
+ if (process != null) {
+ for (int i = 0; i < 60; i++) {
+
+ logger.trace("trying to shut down: "+i);
+ try {
int exit = process.exitValue();
logger.trace("trying to stop:"+exit);
stop();
@@ -192,13 +193,13 @@ public class BatchProcessDriverCLI {
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
- //swallow
- }
- }
- logger.error("Process didn't stop after 10 seconds after shutdown.
" +
- "I am forcefully killing it.");
- }
- interruptWatcherThread.interrupt();
+ //swallow
+ }
+ }
+ logger.error("Process didn't stop after 60 seconds after shutdown.
" +
+ "I am forcefully killing it.");
+ }
+ interruptWatcherThread.interrupt();
}
public int getNumRestarts() {
@@ -260,12 +261,17 @@ public class BatchProcessDriverCLI {
interruptWriter = new InterruptWriter(process.getOutputStream());
interruptWriterThread = new Thread(interruptWriter);
interruptWriterThread.start();
-
- }
-
- public void setRedirectChildProcessToStdOut(boolean
redirectChildProcessToStdOut) {
- this.redirectChildProcessToStdOut = redirectChildProcessToStdOut;
- }
+
+ }
+
+ /**
+ * Typically only used for testing. This determines whether or not
+ * to redirect child process's stdOut to driver's stdout
+ * @param redirectChildProcessToStdOut should the driver redirect the
child's stdout
+ */
+ public void setRedirectChildProcessToStdOut(boolean
redirectChildProcessToStdOut) {
+ this.redirectChildProcessToStdOut = redirectChildProcessToStdOut;
+ }
/**
* Class to watch stdin from the driver for anything that is typed.
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java
Wed Apr 1 18:27:23 2015
@@ -20,21 +20,24 @@ package org.apache.tika.batch;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
-import java.io.Closeable;
-import java.io.Flushable;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.StringWriter;
-import java.util.Date;
+import java.io.Closeable;
+import java.io.Flushable;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.Date;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
-import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
-import org.apache.log4j.Level;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.slf4j.MarkerFactory;
+import org.xml.sax.ContentHandler;
/**
@@ -42,23 +45,29 @@ import org.slf4j.MarkerFactory;
* goal of this class is to abstract out the multithreading
* and recordkeeping components.
* <p/>
- */
-public abstract class FileResourceConsumer implements
Callable<IFileProcessorFutureResult> {
-
- private static enum STATE {
- NOT_YET_STARTED,
- ACTIVELY_CONSUMING,
- SWALLOWED_POISON,
+ */
+public abstract class FileResourceConsumer implements
Callable<IFileProcessorFutureResult> {
+
+ private enum STATE {
+ NOT_YET_STARTED,
+ ACTIVELY_CONSUMING,
+ SWALLOWED_POISON,
THREAD_INTERRUPTED,
EXCEEDED_MAX_CONSEC_WAIT_MILLIS,
ASKED_TO_SHUTDOWN,
TIMED_OUT,
CONSUMER_EXCEPTION,
CONSUMER_ERROR,
- COMPLETED
- }
-
- public static String TIME_OUT = "timeout";
+ COMPLETED
+ }
+
+ public static String TIMED_OUT = "timed_out";
+ public static String OOM = "oom";
+ public static String IO_IS = "io_on_inputstream";
+ public static String IO_OS = "io_on_outputstream";
+ public static String PARSE_ERR = "parse_err";
+ public static String PARSE_EX = "parse_ex";
+
public static String ELAPSED_MILLIS = "elapsedMS";
private static AtomicInteger numConsumers = new AtomicInteger(-1);
@@ -248,34 +257,35 @@ public abstract class FileResourceConsum
FileStarted tmp = currentFile;
if (tmp == null) {
return null;
- }
- if (tmp.getElapsedMillis() > staleThresholdMillis) {
- setEndedState(STATE.TIMED_OUT);
- logWithResourceId(Level.FATAL, TIME_OUT,
- tmp.getResourceId(), ELAPSED_MILLIS,
Long.toString(tmp.getElapsedMillis()));
- return tmp;
- }
- }
- return null;
- }
-
- protected void logWithResourceId(Level level, String type, String
resourceId, String... attrs) {
- logWithResourceId(level, type, resourceId, null, attrs);
- }
-
- /**
- * Use this for structured output that captures resourceId and other
attributes.
- *
- * @param level level
- * @param type entity name for exception
- * @param resourceId resourceId string
- * @param t throwable can be null
- * @param attrs (array of key0, value0, key1, value1, etc.)
- */
- protected void logWithResourceId(Level level, String type, String
resourceId, Throwable t, String... attrs) {
-
- StringWriter writer = new StringWriter();
- try {
+ }
+ if (tmp.getElapsedMillis() > staleThresholdMillis) {
+ setEndedState(STATE.TIMED_OUT);
+ logger.error("{}", getXMLifiedLogMsg(
+ TIMED_OUT,
+ tmp.getResourceId(),
+ ELAPSED_MILLIS,
Long.toString(tmp.getElapsedMillis())));
+ return tmp;
+ }
+ }
+ return null;
+ }
+
+ protected String getXMLifiedLogMsg(String type, String resourceId,
String... attrs) {
+ return getXMLifiedLogMsg(type, resourceId, null, attrs);
+ }
+
+ /**
+ * Use this for structured output that captures resourceId and other
attributes.
+ *
+ * @param type entity name for exception
+ * @param resourceId resourceId string
+ * @param t throwable can be null
+ * @param attrs (array of key0, value0, key1, value1, etc.)
+ */
+ protected String getXMLifiedLogMsg(String type, String resourceId,
Throwable t, String... attrs) {
+
+ StringWriter writer = new StringWriter();
+ try {
XMLStreamWriter xml =
xmlOutputFactory.createXMLStreamWriter(writer);
xml.writeStartDocument();
xml.writeStartElement(type);
@@ -299,23 +309,7 @@ public abstract class FileResourceConsum
} catch (XMLStreamException e) {
logger.error("error writing xml stream for: " + resourceId, t);
}
- switch (level.toInt()) {
- case Level.FATAL_INT:
- logger.error(MarkerFactory.getMarker("FATAL"),
writer.toString());
- break;
- case Level.ERROR_INT:
- logger.error(writer.toString());
- break;
- case Level.WARN_INT:
- logger.warn(writer.toString());
- break;
- case Level.DEBUG_INT :
- logger.debug(writer.toString());
- break;
- case Level.TRACE_INT :
- logger.trace(writer.toString());
- break;
- };
+ return writer.toString();
}
private FileResource getNextFileResource() throws InterruptedException {
@@ -388,10 +382,49 @@ public abstract class FileResourceConsum
synchronized(lock) {
if (currentState == STATE.NOT_YET_STARTED ||
currentState == STATE.ACTIVELY_CONSUMING ||
- currentState == STATE.ASKED_TO_SHUTDOWN) {
- currentState = cause;
- }
- }
- }
-
+ currentState == STATE.ASKED_TO_SHUTDOWN) {
+ currentState = cause;
+ }
+ }
+ }
+
+ /**
+ * Utility method to handle logging equivalently among all
+ * implementing classes. Use, override or avoid as desired.
+ * <p>
+ * This will throw Errors, but it will catch all Exceptions and log them
+ * @param resourceId resourceId
+ * @param parser parser to use
+ * @param is inputStream (will be closed by this method!)
+ * @param handler handler for the content
+ * @param m metadata
+ * @param parseContext parse context
+ * @throws Throwable
+ */
+ protected void parse(final String resourceId, final Parser parser,
InputStream is,
+ final ContentHandler handler,
+ final Metadata m, final ParseContext parseContext)
throws Throwable {
+
+ try {
+ parser.parse(is, handler, m, parseContext);
+ } catch (Throwable t) {
+ if (t instanceof OutOfMemoryError) {
+ logger.error(getXMLifiedLogMsg(OOM,
+ resourceId, t));
+ throw t;
+ } else if (t instanceof Error) {
+ logger.error(getXMLifiedLogMsg(PARSE_ERR,
+ resourceId, t));
+ throw t;
+ } else {
+ //warn, but do not rethrow
+ logger.warn(getXMLifiedLogMsg(PARSE_EX,
+ resourceId, t));
+ incrementHandledExceptions();
+ }
+ } finally {
+ close(is);
+ }
+ }
+
}
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParallelFileProcessingResult.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParallelFileProcessingResult.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParallelFileProcessingResult.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParallelFileProcessingResult.java
Wed Apr 1 18:27:23 2015
@@ -18,22 +18,26 @@ package org.apache.tika.batch;
*/
public class ParallelFileProcessingResult {
- private final int considered;
- private final int added;
- private final int consumed;
- private final double secondsElapsed;
- private final int exitStatus;
- private final String causeForTermination;
-
- public ParallelFileProcessingResult(int considered, int added, int
consumed, double secondsElapsed,
- int exitStatus,
- String causeForTermination) {
- this.considered = considered;
- this.added = added;
- this.consumed = consumed;
- this.secondsElapsed = secondsElapsed;
- this.exitStatus = exitStatus;
- this.causeForTermination = causeForTermination;
+ private final int considered;
+ private final int added;
+ private final int consumed;
+ private final int numberHandledExceptions;
+ private final double secondsElapsed;
+ private final int exitStatus;
+ private final String causeForTermination;
+
+ public ParallelFileProcessingResult(int considered, int added,
+ int consumed, int
numberHandledExceptions,
+ double secondsElapsed,
+ int exitStatus,
+ String causeForTermination) {
+ this.considered = considered;
+ this.added = added;
+ this.consumed = consumed;
+ this.numberHandledExceptions = numberHandledExceptions;
+ this.secondsElapsed = secondsElapsed;
+ this.exitStatus = exitStatus;
+ this.causeForTermination = causeForTermination;
}
/**
@@ -75,12 +79,16 @@ public class ParallelFileProcessingResul
* @return seconds elapsed since the start of the batch processing
*/
public double secondsElapsed() {
- return secondsElapsed;
- }
-
- /**
- *
- * @return intendedExitStatus
+ return secondsElapsed;
+ }
+
+ public int getNumberHandledExceptions() {
+ return numberHandledExceptions;
+ }
+
+ /**
+ *
+ * @return intendedExitStatus
*/
public int getExitStatus() {
return exitStatus;
@@ -89,12 +97,13 @@ public class ParallelFileProcessingResul
@Override
public String toString() {
return "ParallelFileProcessingResult{" +
- "considered=" + considered +
- ", added=" + added +
- ", consumed=" + consumed +
- ", secondsElapsed=" + secondsElapsed +
- ", exitStatus=" + exitStatus +
- ", causeForTermination='" + causeForTermination + '\'' +
+ "considered=" + considered +
+ ", added=" + added +
+ ", consumed=" + consumed +
+ ", numberHandledExceptions=" + numberHandledExceptions +
+ ", secondsElapsed=" + secondsElapsed +
+ ", exitStatus=" + exitStatus +
+ ", causeForTermination='" + causeForTermination + '\'' +
'}';
}
}
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/AbstractFSConsumer.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/AbstractFSConsumer.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/AbstractFSConsumer.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/AbstractFSConsumer.java
Wed Apr 1 18:27:23 2015
@@ -19,13 +19,12 @@ package org.apache.tika.batch.fs;
import java.io.IOException;
import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.log4j.Level;
-import org.apache.tika.batch.BatchNoRestartError;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.FileResourceConsumer;
+import java.io.OutputStream;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.tika.batch.BatchNoRestartError;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
import org.apache.tika.batch.OutputStreamFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -52,54 +51,32 @@ public abstract class AbstractFSConsumer
OutputStream os = null;
try {
os = fsOSFactory.getOutputStream(fileResource.getMetadata());
- } catch (IOException e) {
- //This can happen if the disk has run out of space,
- //or if there was a failure with mkdirs in fsOSFactory
- logWithResourceId(Level.FATAL, "ioe_opening_os",
- fileResource.getResourceId(), e);
- throw new BatchNoRestartError("IOException trying to open output
stream for " +
- fileResource.getResourceId() + " :: " + e.getMessage());
- }
- return os;
- }
-
- protected InputStream getInputStream(FileResource fileResource) {
- InputStream is = null;
+ } catch (IOException e) {
+ //This can happen if the disk has run out of space,
+ //or if there was a failure with mkdirs in fsOSFactory
+ logger.error("{}", getXMLifiedLogMsg(IO_OS,
+ fileResource.getResourceId(), e));
+ throw new BatchNoRestartError("IOException trying to open output
stream for " +
+ fileResource.getResourceId() + " :: " + e.getMessage());
+ }
+ return os;
+ }
+
+ /**
+ *
+ * @param fileResource
+ * @return inputStream, can be null if there is an exception opening IS
+ */
+ protected InputStream getInputStream(FileResource fileResource) {
+ InputStream is = null;
try {
is = fileResource.openInputStream();
} catch (IOException e) {
- logWithResourceId(Level.FATAL, "ioe_opening_is",
- fileResource.getResourceId(), e);
+ logger.warn("{}", getXMLifiedLogMsg(IO_IS,
+ fileResource.getResourceId(), e));
flushAndClose(is);
}
- return is;
- }
-
- protected void parse(final String resourceId, final Parser parser,
InputStream is,
- final ContentHandler handler,
- final Metadata m, final ParseContext parseContext)
throws Throwable {
-
- Throwable thrown = null;
- try {
- parser.parse(is, handler, m, parseContext);
- } catch (Throwable t) {
- if (t instanceof OutOfMemoryError) {
- logWithResourceId(Level.ERROR, "oom",
- resourceId, t);
- } else if (t instanceof Error) {
- logWithResourceId(Level.ERROR, "parse_err",
- resourceId, t);
- } else {
- logWithResourceId(Level.WARN, "parse_ex",
- resourceId, t);
- incrementHandledExceptions();
- }
- thrown = t;
- } finally {
- close(is);
- }
- if (thrown != null) {
- throw thrown;
- }
- }
-}
+ return is;
+ }
+
+}
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java
Wed Apr 1 18:27:23 2015
@@ -19,13 +19,12 @@ package org.apache.tika.batch.fs;
import java.io.InputStream;
import java.io.OutputStream;
-import java.io.UnsupportedEncodingException;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.log4j.Level;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.OutputStreamFactory;
-import org.apache.tika.batch.ParserFactory;
+import java.io.UnsupportedEncodingException;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.OutputStreamFactory;
+import org.apache.tika.batch.ParserFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.ParseContext;
@@ -87,14 +86,14 @@ public class BasicTikaFSConsumer extends
}
ContentHandler handler;
try {
- handler = contentHandlerFactory.getNewContentHandler(os,
getOutputEncoding());
- } catch (UnsupportedEncodingException e) {
- incrementHandledExceptions();
- logWithResourceId(Level.FATAL, "output_encoding_ex",
- fileResource.getResourceId(), e);
- flushAndClose(os);
- throw new RuntimeException(e.getMessage());
- }
+ handler = contentHandlerFactory.getNewContentHandler(os,
getOutputEncoding());
+ } catch (UnsupportedEncodingException e) {
+ incrementHandledExceptions();
+ logger.error(getXMLifiedLogMsg("output_encoding_ex",
+ fileResource.getResourceId(), e));
+ flushAndClose(os);
+ throw new RuntimeException(e.getMessage());
+ }
//now actually call parse!
Throwable thrown = null;
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java
Wed Apr 1 18:27:23 2015
@@ -16,29 +16,24 @@ package org.apache.tika.batch.fs;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ExecutorService;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.log4j.BasicConfigurator;
-import org.apache.log4j.ConsoleAppender;
-import org.apache.log4j.Level;
-import org.apache.log4j.PatternLayout;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
import org.apache.tika.batch.BatchProcess;
import org.apache.tika.batch.BatchProcessDriverCLI;
-import org.apache.tika.batch.ParallelFileProcessingResult;
+import org.apache.tika.batch.ParallelFileProcessingResult;
import org.apache.tika.batch.builders.BatchProcessBuilder;
import org.apache.tika.batch.builders.CommandLineParserBuilder;
import org.apache.tika.io.IOUtils;
@@ -48,39 +43,42 @@ import org.slf4j.LoggerFactory;
import org.slf4j.MarkerFactory;
public class FSBatchProcessCLI {
+
public static String FINISHED_STRING = "Main thread in TikaFSBatchCLI has
finished processing.";
private static Logger logger =
LoggerFactory.getLogger(FSBatchProcessCLI.class);
private final Options options;
public FSBatchProcessCLI(String[] args) throws IOException {
- TikaInputStream configIs = null;
- try {
- configIs = getConfigInputStream(args);
- CommandLineParserBuilder builder = new CommandLineParserBuilder();
- options = builder.build(configIs);
- } finally {
+ TikaInputStream configIs = null;
+ try {
+ configIs = getConfigInputStream(args, true);
+ CommandLineParserBuilder builder = new CommandLineParserBuilder();
+ options = builder.build(configIs);
+ } finally {
IOUtils.closeQuietly(configIs);
}
}
public void usage() {
HelpFormatter helpFormatter = new HelpFormatter();
- helpFormatter.printHelp("tika filesystem batch", options);
- }
-
- private TikaInputStream getConfigInputStream(String[] args) throws
IOException {
- TikaInputStream is = null;
- File batchConfigFile = getConfigFile(args);
- if (batchConfigFile != null) {
+ helpFormatter.printHelp("tika filesystem batch", options);
+ }
+
+ private TikaInputStream getConfigInputStream(String[] args, boolean
logDefault) throws IOException {
+ TikaInputStream is = null;
+ File batchConfigFile = getConfigFile(args);
+ if (batchConfigFile != null) {
//this will throw IOException if it can't find a specified config
file
- //better to throw an exception than silently back off to default.
- is = TikaInputStream.get(batchConfigFile);
- } else {
- logger.info("No config file set via -bc, relying on
default-tika-batch-config.xml");
- is = TikaInputStream.get(
-
FSBatchProcessCLI.class.getResourceAsStream("default-tika-batch-config.xml"));
- }
+ //better to throw an exception than silently back off to default.
+ is = TikaInputStream.get(batchConfigFile);
+ } else {
+ if (logDefault) {
+ logger.info("No config file set via -bc, relying on
default-tika-batch-config.xml");
+ }
+ is = TikaInputStream.get(
+
FSBatchProcessCLI.class.getResourceAsStream("default-tika-batch-config.xml"));
+ }
return is;
}
@@ -104,13 +102,13 @@ public class FSBatchProcessCLI {
}
BatchProcessBuilder b = new BatchProcessBuilder();
- TikaInputStream is = null;
- BatchProcess process = null;
- try {
- is = getConfigInputStream(args);
- process = b.build(is, mapArgs);
- } finally {
- IOUtils.closeQuietly(is);
+ TikaInputStream is = null;
+ BatchProcess process = null;
+ try {
+ is = getConfigInputStream(args, false);
+ process = b.build(is, mapArgs);
+ } finally {
+ IOUtils.closeQuietly(is);
}
final Thread mainThread = Thread.currentThread();
@@ -134,22 +132,11 @@ public class FSBatchProcessCLI {
}
}
}
- return configFile;
- }
-
-
+ return configFile;
+ }
+
public static void main(String[] args) throws Exception {
- //if no log4j config file has been set via
- //sysprops, use BasicConfigurator
- //TODO: figure out if this can cleanly be moved to pure slf4j?
- String log4jFile = System.getProperty("log4j.configuration");
- if (log4jFile == null || log4jFile.trim().length()==0) {
- ConsoleAppender appender = new ConsoleAppender();
- appender.setLayout(new PatternLayout("%m%n"));
- appender.setWriter(new OutputStreamWriter(System.out,
IOUtils.UTF_8.name()));
- BasicConfigurator.configure(appender);
- org.apache.log4j.Logger.getRootLogger().setLevel(Level.INFO);
- }
+
try{
FSBatchProcessCLI cli = new FSBatchProcessCLI(args);
cli.execute(args);
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
---
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
(original)
+++
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
Wed Apr 1 18:27:23 2015
@@ -22,10 +22,9 @@ import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.LinkedList;
-import java.util.List;
+import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
-import org.apache.log4j.Level;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.OutputStreamFactory;
import org.apache.tika.batch.ParserFactory;
@@ -130,14 +129,16 @@ public class RecursiveParserWrapperFSCon
Writer writer = null;
try {
- writer = new OutputStreamWriter(os, getOutputEncoding());
- JsonMetadataList.toJson(metadataList, writer);
- } catch (Exception e) {
- logWithResourceId(Level.ERROR, "json_ex",
- fileResource.getResourceId(), e);
- } finally {
- flushAndClose(writer);
- }
+ writer = new OutputStreamWriter(os, getOutputEncoding());
+ JsonMetadataList.toJson(metadataList, writer);
+ } catch (Exception e) {
+ //this is a stop the world kind of thing
+ logger.error("{}", getXMLifiedLogMsg(IO_OS+"json",
+ fileResource.getResourceId(), e));
+ throw new RuntimeException(e);
+ } finally {
+ flushAndClose(writer);
+ }
if (thrown != null) {
if (thrown instanceof Error) {
Modified: tika/trunk/tika-batch/src/test/resources/log4j.properties
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/log4j.properties?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/log4j.properties (original)
+++ tika/trunk/tika-batch/src/test/resources/log4j.properties Wed Apr 1
18:27:23 2015
@@ -1,8 +1,22 @@
-
-log4j.rootLogger=OFF,A1
-
-#for debugging
-#log4j.rootLogger=TRACE,A1
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log4j.rootLogger=OFF
+
+#for debugging
+#log4j.rootLogger=TRACE,A1
log4j.appender.A1=org.apache.log4j.ConsoleAppender
Modified: tika/trunk/tika-batch/src/test/resources/log4j_process.properties
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/log4j_process.properties?rev=1670749&r1=1670748&r2=1670749&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/log4j_process.properties (original)
+++ tika/trunk/tika-batch/src/test/resources/log4j_process.properties Wed Apr
1 18:27:23 2015
@@ -1,8 +1,24 @@
-
-log4j.rootLogger=OFF,A1
-
-#for debugging
-#log4j.rootLogger=TRACE,A1
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#This is used by the batch process; see log4j.properties for the driver
+
+log4j.rootLogger=OFF
+
+#for debugging
+#log4j.rootLogger=TRACE,A1
log4j.appender.A1=org.apache.log4j.ConsoleAppender