Author: tallison Date: Wed Apr 1 18:27:23 2015 New Revision: 1670749 URL: http://svn.apache.org/r1670749 Log: TIKA-1330 clean up logging in tika-batch and tika-app integration of tika-batch
Added: tika/trunk/tika-app/src/main/resources/log4j_batch_process.properties tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParallelFileProcessingResult.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/AbstractFSConsumer.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java tika/trunk/tika-batch/src/test/resources/log4j.properties tika/trunk/tika-batch/src/test/resources/log4j_process.properties Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java (original) +++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/BatchCommandLineBuilder.java Wed Apr 1 18:27:23 2015 @@ -39,13 +39,13 @@ class BatchCommandLineBuilder { static Pattern JVM_OPTS_PATTERN = Pattern.compile("^(--?)J(.+)"); protected static String[] build(String[] args) throws 
IOException { - Map<String, String> processArgs = new LinkedHashMap<String, String>(); - Map<String, String> jvmOpts = new LinkedHashMap<String,String>(); - //take the args, and divide them into process args and options for - //the parent jvm process (i.e. log files, etc) - mapifyArgs(args, processArgs, jvmOpts); - - //now modify processArgs in place + Map<String, String> processArgs = new LinkedHashMap<String, String>(); + Map<String, String> jvmOpts = new LinkedHashMap<String,String>(); + //take the args, and divide them into process args and options for + //the child jvm process (i.e. log files, etc) + mapifyArgs(args, processArgs, jvmOpts); + + //now modify processArgs in place translateCommandLine(args, processArgs); //maybe the user specified a different classpath?! @@ -56,12 +56,23 @@ class BatchCommandLineBuilder { if (cp.contains(" ")){ cp = "\""+cp+"\""; } - jvmOpts.put("-cp", cp); - } - - //now build the full command line - List<String> fullCommand = new ArrayList<String>(); - fullCommand.add("java"); + jvmOpts.put("-cp", cp); + } + + boolean hasLog4j = false; + for (String k : jvmOpts.keySet()) { + if (k.startsWith("-Dlog4j.configuration=")) { + hasLog4j = true; + break; + } + } + //use the log4j config file inside the app /resources/log4j_batch_process.properties + if (! hasLog4j) { + jvmOpts.put("-Dlog4j.configuration=\"log4j_batch_process.properties\"", ""); + } + //now build the full command line + List<String> fullCommand = new ArrayList<String>(); + fullCommand.add("java"); for (Map.Entry<String, String> e : jvmOpts.entrySet()) { fullCommand.add(e.getKey()); if (e.getValue().length() > 0) { @@ -79,16 +90,16 @@ class BatchCommandLineBuilder { return fullCommand.toArray(new String[fullCommand.size()]); } - - /** - * Take the input args and separate them into args that belong on the commandline - * and those that belong as jvm args for the parent process. 
- * @param args -- literal args from TikaCLI commandline - * @param commandLine args that should be part of the batch commandline - * @param jvmArgs args that belong as jvm arguments for the parent process - */ - private static void mapifyArgs(final String[] args, - final Map<String, String> commandLine, + + /** + * Take the input args and separate them into args that belong on the commandline + * and those that belong as jvm args for the child process. + * @param args -- literal args from TikaCLI commandline + * @param commandLine args that should be part of the batch commandline + * @param jvmArgs args that belong as jvm arguments for the child process + */ + private static void mapifyArgs(final String[] args, + final Map<String, String> commandLine, final Map<String, String> jvmArgs) { if (args.length == 0) { @@ -187,8 +198,9 @@ class BatchCommandLineBuilder { if (map.containsKey("--outputDir") || map.containsKey("-o")) { String v1 = map.remove("--outputDir"); String v2 = map.remove("-o"); - String v = (v1 == null) ? v2 : v1; - map.put("-outputDir", v); - } - } -} + String v = (v1 == null) ? 
v2 : v1; + map.put("-outputDir", v); + } + + } +} Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original) +++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Apr 1 18:27:23 2015 @@ -21,7 +21,6 @@ import javax.xml.transform.TransformerCo import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; - import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -65,7 +64,6 @@ import org.apache.poi.poifs.filesystem.D import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.Tika; import org.apache.tika.batch.BatchProcessDriverCLI; -import org.apache.tika.batch.fs.FSBatchProcessCLI; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.DefaultDetector; @@ -115,10 +113,17 @@ public class TikaCLI { private static final Log logger = LogFactory.getLog(TikaCLI.class); public static void main(String[] args) throws Exception { + + String log4jFile = System.getProperty("log4j.configuration"); + if (log4jFile == null || log4jFile.trim().length()==0) { + BasicConfigurator.configure( + new WriterAppender(new SimpleLayout(), System.err)); + Logger.getRootLogger().setLevel(Level.INFO); + } + TikaCLI cli = new TikaCLI(); if (cli.testForHelp(args)) { - FSBatchProcessCLI batchProcessCLI = new FSBatchProcessCLI(args); cli.usage(); return; } else if (cli.testForBatch(args)) { @@ -128,10 +133,6 @@ public class TikaCLI { return; } - BasicConfigurator.configure( - new WriterAppender(new SimpleLayout(), System.err)); - 
Logger.getRootLogger().setLevel(Level.INFO); - if (args.length > 0) { for (int i = 0; i < args.length; i++) { cli.process(args[i]); @@ -587,7 +588,7 @@ public class TikaCLI { out.println(); out.println(" Simplest method."); out.println(" Specify two directories as args with no other args:"); - out.println(" java -jar tika-app.jar <inputDirectory> <outputDirectory"); + out.println(" java -jar tika-app.jar <inputDirectory> <outputDirectory>"); out.println(); out.println("Batch Options:"); out.println(" -i or --inputDir Input directory"); @@ -610,7 +611,6 @@ public class TikaCLI { out.println(); out.println(" To modify child process jvm args, prepend \"J\" as in:"); out.println(" -JXmx4g or -JDlog4j.configuration=file:log4j.xml."); - } private void version() { Added: tika/trunk/tika-app/src/main/resources/log4j_batch_process.properties URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/resources/log4j_batch_process.properties?rev=1670749&view=auto ============================================================================== --- tika/trunk/tika-app/src/main/resources/log4j_batch_process.properties (added) +++ tika/trunk/tika-app/src/main/resources/log4j_batch_process.properties Wed Apr 1 18:27:23 2015 @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +#info,debug, error,fatal ... +log4j.rootLogger=info,stdout + +#console +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout + + +log4j.appender.stdout.layout.ConversionPattern=%m%n Added: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java?rev=1670749&view=auto ============================================================================== --- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java (added) +++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLIBatchIntegrationTest.java Wed Apr 1 18:27:23 2015 @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.cli; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.PrintStream; +import java.io.Reader; +import java.util.List; + +import org.apache.commons.io.FileUtils; +import org.apache.tika.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.serialization.JsonMetadataList; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TikaCLIBatchIntegrationTest { + + private File testDataFile = new File("src/test/resources/test-data"); + + private File tempDir; + private OutputStream out = null; + private OutputStream err = null; + private ByteArrayOutputStream outBuffer = null; + + @Before + public void setup() throws Exception { + tempDir = File.createTempFile("tika-cli-test-batch-", ""); + tempDir.delete(); + tempDir.mkdir(); + outBuffer = new ByteArrayOutputStream(); + PrintStream outWriter = new PrintStream(outBuffer, true, IOUtils.UTF_8.name()); + ByteArrayOutputStream errBuffer = new ByteArrayOutputStream(); + PrintStream errWriter = new PrintStream(errBuffer, true, IOUtils.UTF_8.name()); + out = System.out; + err = System.err; + System.setOut(outWriter); + System.setErr(errWriter); + } + + @After + public void tearDown() throws Exception { + System.setOut(new PrintStream(out, true, IOUtils.UTF_8.name())); + System.setErr(new PrintStream(err, true, IOUtils.UTF_8.name())); + FileUtils.deleteDirectory(tempDir); + } + + @Test + public void testSimplestBatchIntegration() throws Exception { + String[] params = {escape(testDataFile.getAbsolutePath()), + escape(tempDir.getAbsolutePath())}; + TikaCLI.main(params); + + assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile()); + 
assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists()); + } + + @Test + public void testBasicBatchIntegration() throws Exception { + String[] params = {"-i", escape(testDataFile.getAbsolutePath()), + "-o", escape(tempDir.getAbsolutePath()), + "-numConsumers", "2" + }; + TikaCLI.main(params); + + assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile()); + assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists()); + } + + @Test + public void testJsonRecursiveBatchIntegration() throws Exception { + Reader reader = null; + try { + String[] params = {"-i", escape(testDataFile.getAbsolutePath()), + "-o", escape(tempDir.getAbsolutePath()), + "-numConsumers", "10", + "-J", //recursive Json + "-t" //plain text in content + }; + TikaCLI.main(params); + reader = new InputStreamReader( + new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8); + List<Metadata> metadataList = JsonMetadataList.fromJson(reader); + assertEquals(12, metadataList.size()); + assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events")); + } finally { + IOUtils.closeQuietly(reader); + } + } + + @Test + public void testProcessLogFileConfig() throws Exception { + String[] params = {"-i", escape(testDataFile.getAbsolutePath()), + "-o", escape(tempDir.getAbsolutePath()), + "-numConsumers", "2", + "-JDlog4j.configuration=log4j_batch_process_test.properties"}; + TikaCLI.main(params); + + assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile()); + assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists()); + String sysOutString = new String(outBuffer.toByteArray(), IOUtils.UTF_8); + assertTrue(sysOutString.contains("MY_CUSTOM_LOG_CONFIG")); + } + + public static String escape(String path) { + if (path.indexOf(' ') > -1) { + return '"' + path + '"'; + } + return path; + } + +} Modified: 
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original) +++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Wed Apr 1 18:27:23 2015 @@ -16,26 +16,17 @@ */ package org.apache.tika.cli; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; import java.io.PrintStream; -import java.io.Reader; import java.net.URI; -import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.io.IOUtils; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.serialization.JsonMetadataList; -import org.apache.tika.parser.RecursiveParserWrapper; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -387,97 +378,4 @@ public class TikaCLITest { assertTrue(content.contains("\\n\\nembed_4\\n")); assertTrue(content.contains("\\n\\nembed_0")); } - - @Test - public void testSimplestBatchIntegration() throws Exception { - File tempDir = File.createTempFile("tika-cli-test-batch-", ""); - tempDir.delete(); - tempDir.mkdir(); - ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(); - PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name()); - OutputStream os = System.out; - System.setOut(writer); - try { - String[] params = {escape(testDataFile.getAbsolutePath()), - escape(tempDir.getAbsolutePath())}; - TikaCLI.main(params); - - StringBuffer allFiles = new StringBuffer(); - 
assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile()); - assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists()); - } finally { - //reset in case something went horribly wrong - System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name())); - FileUtils.deleteDirectory(tempDir); - } - } - - @Test - public void testBasicBatchIntegration() throws Exception { - File tempDir = File.createTempFile("tika-cli-test-batch-", ""); - tempDir.delete(); - tempDir.mkdir(); - ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(); - PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name()); - OutputStream os = System.out; - System.setOut(writer); - try { - String[] params = {"-i", escape(testDataFile.getAbsolutePath()), - "-o", escape(tempDir.getAbsolutePath()), - "-numConsumers", "2", - "-reporterSleepMillis", "100"};//report often to make sure - TikaCLI.main(params); - - StringBuffer allFiles = new StringBuffer(); - assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile()); - assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists()); - String sysOutString = new String(outBuffer.toByteArray(), IOUtils.UTF_8); - - assertEquals(-1, sysOutString.indexOf("There are 3 file processors still active")); - assertTrue(sysOutString.indexOf("There are 2 file processors") > -1); - } finally { - //reset in case something went horribly wrong - System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name())); - FileUtils.deleteDirectory(tempDir); - } - } - - @Test - public void testJsonRecursiveBatchIntegration() throws Exception { - File tempDir = File.createTempFile("tika-cli-test-batch-", ""); - tempDir.delete(); - tempDir.mkdir(); - ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(); - PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name()); - OutputStream os = System.out; - System.setOut(writer); - Reader reader = null; - try { - String[] params = {"-i", 
escape(testDataFile.getAbsolutePath()), - "-o", escape(tempDir.getAbsolutePath()), - "-numConsumers", "10", - "-J", //recursive Json - "-t" //plain text in content - }; - TikaCLI.main(params); - reader = new InputStreamReader( - new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8); - List<Metadata> metadataList = JsonMetadataList.fromJson(reader); - assertEquals(12, metadataList.size()); - assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events")); - } finally { - IOUtils.closeQuietly(reader); - //reset in case something went horribly wrong - System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name())); - FileUtils.deleteDirectory(tempDir); - } - } - - - public static String escape(String path) { - if (path.indexOf(' ') > -1){ - return '"'+path+'"'; - } - return path; - } } Added: tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties?rev=1670749&view=auto ============================================================================== --- tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties (added) +++ tika/trunk/tika-app/src/test/resources/log4j_batch_process_test.properties Wed Apr 1 18:27:23 2015 @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#info,debug, error,fatal ... +log4j.rootLogger=info,stdout + +#console +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout + + +log4j.appender.stdout.layout.ConversionPattern=MY_CUSTOM_LOG_CONFIG %m%n Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcess.java Wed Apr 1 18:27:23 2015 @@ -127,15 +127,17 @@ public class BatchProcess implements Cal */ public ParallelFileProcessingResult call() throws InterruptedException { - if (alreadyExecuted) { - throw new IllegalStateException("Can only execute BatchRunner once."); - } - //redirect streams - try { - outputStreamWriter = new PrintStream(System.err, true, IOUtils.UTF_8.toString()); - } catch (IOException e) { - throw new RuntimeException("Can't redirect streams"); - } + if (alreadyExecuted) { + throw new IllegalStateException("Can only execute BatchRunner once."); + } + //redirect streams; all organic warnings should go to System.err; + //System.err should be redirected to System.out + PrintStream sysErr = System.err; + try { + outputStreamWriter = new PrintStream(sysErr, true, IOUtils.UTF_8.toString()); + } catch 
(IOException e) { + throw new RuntimeException("Can't redirect streams"); + } System.setErr(System.out); ParallelFileProcessingResult result = null; @@ -152,13 +154,13 @@ public class BatchProcess implements Cal TimeoutChecker timeoutChecker = new TimeoutChecker(); try { - startConsumersManager(); - } catch (BatchNoRestartError e) { - return new - ParallelFileProcessingResult(0, 0, 0, - 0, BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE, - CAUSE_FOR_TERMINATION.CONSUMERS_MANAGER_DIDNT_INIT_IN_TIME_NO_RESTART.toString()); - + startConsumersManager(); + } catch (BatchNoRestartError e) { + return new + ParallelFileProcessingResult(0, 0, 0, 0, + 0, BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE, + CAUSE_FOR_TERMINATION.CONSUMERS_MANAGER_DIDNT_INIT_IN_TIME_NO_RESTART.toString()); + } State state = mainLoop(completionService, timeoutChecker); @@ -198,13 +200,12 @@ public class BatchProcess implements Cal if (futureResult != null) { state.removed++; - IFileProcessorFutureResult result = futureResult.get(); - if (result instanceof FileConsumerFutureResult) { - state.consumersRemoved++; - state.processed += ((FileConsumerFutureResult) result).getFilesProcessed(); - } else if (result instanceof FileResourceCrawlerFutureResult) { - state.crawlersRemoved++; - if (fileResourceCrawler.wasTimedOut()) { + IFileProcessorFutureResult result = futureResult.get(); + if (result instanceof FileConsumerFutureResult) { + state.consumersRemoved++; + } else if (result instanceof FileResourceCrawlerFutureResult) { + state.crawlersRemoved++; + if (fileResourceCrawler.wasTimedOut()) { causeForTermination = CAUSE_FOR_TERMINATION.CRAWLER_TIMED_OUT; break; } @@ -290,13 +291,12 @@ public class BatchProcess implements Cal break; } try { - IFileProcessorFutureResult result = future.get(); - if (result instanceof FileConsumerFutureResult) { - FileConsumerFutureResult consumerResult = (FileConsumerFutureResult) result; - state.processed += consumerResult.getFilesProcessed(); - FileStarted 
fileStarted = consumerResult.getFileStarted(); - if (fileStarted != null - && fileStarted.getElapsedMillis() > timeoutThresholdMillis) { + IFileProcessorFutureResult result = future.get(); + if (result instanceof FileConsumerFutureResult) { + FileConsumerFutureResult consumerResult = (FileConsumerFutureResult) result; + FileStarted fileStarted = consumerResult.getFileStarted(); + if (fileStarted != null + && fileStarted.getElapsedMillis() > timeoutThresholdMillis) { logger.warn(fileStarted.getResourceId() + "\t caused a file processor to hang or crash. You may need to remove " + "this file from your input set and rerun."); @@ -345,18 +345,23 @@ public class BatchProcess implements Cal "< for " + fs.getElapsedMillis() + " milliseconds after it started." + " This exceeds the maxTimeoutMillis parameter"); } - double elapsed = ((double) new Date().getTime() - (double) state.start) / 1000.0; - return new - ParallelFileProcessingResult(considered, added, state.processed, - elapsed, exitStatus, state.causeForTermination.toString()); - } - - private class State { - long start = -1; - int processed = 0; - int numConsumers = 0; - int numNonConsumers = 0; - int removed = 0; + double elapsed = ((double) new Date().getTime() - (double) state.start) / 1000.0; + int processed = 0; + int numExceptions = 0; + for (FileResourceConsumer c : consumersManager.getConsumers()) { + processed += c.getNumResourcesConsumed(); + numExceptions += c.getNumHandledExceptions(); + } + return new + ParallelFileProcessingResult(considered, added, processed, numExceptions, + elapsed, exitStatus, state.causeForTermination.toString()); + } + + private class State { + long start = -1; + int numConsumers = 0; + int numNonConsumers = 0; + int removed = 0; int consumersRemoved = 0; int crawlersRemoved = 0; CAUSE_FOR_TERMINATION causeForTermination = null; @@ -574,12 +579,14 @@ public class BatchProcess implements Cal } } } - } - - private class TimeoutFutureResult implements IFileProcessorFutureResult { - 
private final int timedOutCount; - - private TimeoutFutureResult(final int timedOutCount) { + } + + private class TimeoutFutureResult implements IFileProcessorFutureResult { + //used to be used when more than one timeout was allowed + //TODO: get rid of this? + private final int timedOutCount; + + private TimeoutFutureResult(final int timedOutCount) { this.timedOutCount = timedOutCount; } Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/BatchProcessDriverCLI.java Wed Apr 1 18:27:23 2015 @@ -102,21 +102,21 @@ public class BatchProcessDriverCLI { } public void execute() throws Exception { - - interruptWatcherThread.setDaemon(true); - interruptWatcherThread.start(); - logger.trace("about to start"); - start(); - int loopsAfterRestartMessageReceived = 0; - while (!userInterrupted) { + + interruptWatcherThread.setDaemon(true); + interruptWatcherThread.start(); + logger.info("about to start driver"); + start(); + int loopsAfterRestartMessageReceived = 0; + while (!userInterrupted) { Integer exit = null; - try { - logger.trace("about to check exit value"); - exit = process.exitValue(); - logger.trace("exit value:" + exit); - stop(); - } catch (IllegalThreadStateException e) { - //hasn't exited + try { + logger.trace("about to check exit value"); + exit = process.exitValue(); + logger.info("The child process has finished with an exit value of: "+exit); + stop(); + } catch (IllegalThreadStateException e) { + //hasn't exited logger.trace("process has not exited; IllegalThreadStateException"); } @@ -135,13 +135,13 @@ public class 
BatchProcessDriverCLI { " exit=" + exit + " receivedRestartMsg=" + receivedRestartMsg); //if we've gotten the message via stdout to restart //but the process hasn't exited yet, give it another - //chance - if (receivedRestartMsg && exit == null) { - loopsAfterRestartMessageReceived++; - logger.trace("Must restart, still not exited; loops after restart: " + - loopsAfterRestartMessageReceived); - continue; - } + //chance + if (receivedRestartMsg && exit == null) { + loopsAfterRestartMessageReceived++; + logger.warn("Must restart, still not exited; loops after restart: " + + loopsAfterRestartMessageReceived); + continue; + } if (loopsAfterRestartMessageReceived > waitNumLoopsAfterRestartmessage) { logger.trace("About to try to restart because:" + " exit=" + exit + " receivedRestartMsg=" + receivedRestartMsg); @@ -153,13 +153,13 @@ public class BatchProcessDriverCLI { } } else if (exit != null && exit != BatchProcessDriverCLI.PROCESS_NO_RESTART_EXIT_CODE && exit != BatchProcessDriverCLI.PROCESS_COMPLETED_SUCCESSFULLY) { - logger.trace("About to try to restart because:" + - " exit=" + exit + " receivedRestartMsg=" + receivedRestartMsg); - - if (exit != null && exit == BatchProcessDriverCLI.PROCESS_RESTART_EXIT_CODE) { - logger.info("Restarting on expected restart code"); - } else { - logger.warn("Restarting on unexpected restart code: "+exit); + logger.trace("About to try to restart because:" + + " exit=" + exit + " receivedRestartMsg=" + receivedRestartMsg); + + if (exit == BatchProcessDriverCLI.PROCESS_RESTART_EXIT_CODE) { + logger.info("Restarting on expected restart code"); + } else { + logger.warn("Restarting on unexpected restart code: "+exit); } boolean restarted = restart(exit, receivedRestartMsg); if (!restarted) { @@ -170,17 +170,18 @@ public class BatchProcessDriverCLI { logger.trace("Will not restart: "+exit); break; } - } - logger.trace("about to call shutdown driver now"); - shutdownDriverNow(); - } - - private void shutdownDriverNow() { - if (process != 
null) { - for (int i = 0; i < 10; i++) { - - logger.trace("trying to shut down: "+i); - try { + } + logger.trace("about to call shutdown driver now"); + shutdownDriverNow(); + logger.info("Process driver has completed"); + } + + private void shutdownDriverNow() { + if (process != null) { + for (int i = 0; i < 60; i++) { + + logger.trace("trying to shut down: "+i); + try { int exit = process.exitValue(); logger.trace("trying to stop:"+exit); stop(); @@ -192,13 +193,13 @@ public class BatchProcessDriverCLI { try { Thread.sleep(1000); } catch (InterruptedException e) { - //swallow - } - } - logger.error("Process didn't stop after 10 seconds after shutdown. " + - "I am forcefully killing it."); - } - interruptWatcherThread.interrupt(); + //swallow + } + } + logger.error("Process didn't stop after 60 seconds after shutdown. " + + "I am forcefully killing it."); + } + interruptWatcherThread.interrupt(); } public int getNumRestarts() { @@ -260,12 +261,17 @@ public class BatchProcessDriverCLI { interruptWriter = new InterruptWriter(process.getOutputStream()); interruptWriterThread = new Thread(interruptWriter); interruptWriterThread.start(); - - } - - public void setRedirectChildProcessToStdOut(boolean redirectChildProcessToStdOut) { - this.redirectChildProcessToStdOut = redirectChildProcessToStdOut; - } + + } + + /** + * Typically only used for testing. This determines whether or not + * to redirect child process's stdOut to driver's stdout + * @param redirectChildProcessToStdOut should the driver redirect the child's stdout + */ + public void setRedirectChildProcessToStdOut(boolean redirectChildProcessToStdOut) { + this.redirectChildProcessToStdOut = redirectChildProcessToStdOut; + } /** * Class to watch stdin from the driver for anything that is typed. 
Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java Wed Apr 1 18:27:23 2015 @@ -20,21 +20,24 @@ package org.apache.tika.batch; import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamWriter; -import java.io.Closeable; -import java.io.Flushable; -import java.io.IOException; -import java.io.PrintWriter; -import java.io.StringWriter; -import java.util.Date; +import java.io.Closeable; +import java.io.Flushable; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.Date; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.Callable; -import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; -import org.apache.log4j.Level; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.slf4j.MarkerFactory; +import org.xml.sax.ContentHandler; /** @@ -42,23 +45,29 @@ import org.slf4j.MarkerFactory; * goal of this class is to abstract out the multithreading * and recordkeeping components. 
* <p/> - */ -public abstract class FileResourceConsumer implements Callable<IFileProcessorFutureResult> { - - private static enum STATE { - NOT_YET_STARTED, - ACTIVELY_CONSUMING, - SWALLOWED_POISON, + */ +public abstract class FileResourceConsumer implements Callable<IFileProcessorFutureResult> { + + private enum STATE { + NOT_YET_STARTED, + ACTIVELY_CONSUMING, + SWALLOWED_POISON, THREAD_INTERRUPTED, EXCEEDED_MAX_CONSEC_WAIT_MILLIS, ASKED_TO_SHUTDOWN, TIMED_OUT, CONSUMER_EXCEPTION, CONSUMER_ERROR, - COMPLETED - } - - public static String TIME_OUT = "timeout"; + COMPLETED + } + + public static String TIMED_OUT = "timed_out"; + public static String OOM = "oom"; + public static String IO_IS = "io_on_inputstream"; + public static String IO_OS = "io_on_outputstream"; + public static String PARSE_ERR = "parse_err"; + public static String PARSE_EX = "parse_ex"; + public static String ELAPSED_MILLIS = "elapsedMS"; private static AtomicInteger numConsumers = new AtomicInteger(-1); @@ -248,34 +257,35 @@ public abstract class FileResourceConsum FileStarted tmp = currentFile; if (tmp == null) { return null; - } - if (tmp.getElapsedMillis() > staleThresholdMillis) { - setEndedState(STATE.TIMED_OUT); - logWithResourceId(Level.FATAL, TIME_OUT, - tmp.getResourceId(), ELAPSED_MILLIS, Long.toString(tmp.getElapsedMillis())); - return tmp; - } - } - return null; - } - - protected void logWithResourceId(Level level, String type, String resourceId, String... attrs) { - logWithResourceId(level, type, resourceId, null, attrs); - } - - /** - * Use this for structured output that captures resourceId and other attributes. - * - * @param level level - * @param type entity name for exception - * @param resourceId resourceId string - * @param t throwable can be null - * @param attrs (array of key0, value0, key1, value1, etc.) - */ - protected void logWithResourceId(Level level, String type, String resourceId, Throwable t, String... 
attrs) { - - StringWriter writer = new StringWriter(); - try { + } + if (tmp.getElapsedMillis() > staleThresholdMillis) { + setEndedState(STATE.TIMED_OUT); + logger.error("{}", getXMLifiedLogMsg( + TIMED_OUT, + tmp.getResourceId(), + ELAPSED_MILLIS, Long.toString(tmp.getElapsedMillis()))); + return tmp; + } + } + return null; + } + + protected String getXMLifiedLogMsg(String type, String resourceId, String... attrs) { + return getXMLifiedLogMsg(type, resourceId, null, attrs); + } + + /** + * Use this for structured output that captures resourceId and other attributes. + * + * @param type entity name for exception + * @param resourceId resourceId string + * @param t throwable can be null + * @param attrs (array of key0, value0, key1, value1, etc.) + */ + protected String getXMLifiedLogMsg(String type, String resourceId, Throwable t, String... attrs) { + + StringWriter writer = new StringWriter(); + try { XMLStreamWriter xml = xmlOutputFactory.createXMLStreamWriter(writer); xml.writeStartDocument(); xml.writeStartElement(type); @@ -299,23 +309,7 @@ public abstract class FileResourceConsum } catch (XMLStreamException e) { logger.error("error writing xml stream for: " + resourceId, t); } - switch (level.toInt()) { - case Level.FATAL_INT: - logger.error(MarkerFactory.getMarker("FATAL"), writer.toString()); - break; - case Level.ERROR_INT: - logger.error(writer.toString()); - break; - case Level.WARN_INT: - logger.warn(writer.toString()); - break; - case Level.DEBUG_INT : - logger.debug(writer.toString()); - break; - case Level.TRACE_INT : - logger.trace(writer.toString()); - break; - }; + return writer.toString(); } private FileResource getNextFileResource() throws InterruptedException { @@ -388,10 +382,49 @@ public abstract class FileResourceConsum synchronized(lock) { if (currentState == STATE.NOT_YET_STARTED || currentState == STATE.ACTIVELY_CONSUMING || - currentState == STATE.ASKED_TO_SHUTDOWN) { - currentState = cause; - } - } - } - + currentState == 
STATE.ASKED_TO_SHUTDOWN) { + currentState = cause; + } + } + } + + /** + * Utility method to handle logging equivalently among all + * implementing classes. Use, override or avoid as desired. + * <p> + * This will throw Errors, but it will catch all Exceptions and log them + * @param resourceId resourceId + * @param parser parser to use + * @param is inputStream (will be closed by this method!) + * @param handler handler for the content + * @param m metadata + * @param parseContext parse context + * @throws Throwable + */ + protected void parse(final String resourceId, final Parser parser, InputStream is, + final ContentHandler handler, + final Metadata m, final ParseContext parseContext) throws Throwable { + + try { + parser.parse(is, handler, m, parseContext); + } catch (Throwable t) { + if (t instanceof OutOfMemoryError) { + logger.error(getXMLifiedLogMsg(OOM, + resourceId, t)); + throw t; + } else if (t instanceof Error) { + logger.error(getXMLifiedLogMsg(PARSE_ERR, + resourceId, t)); + throw t; + } else { + //warn, but do not rethrow + logger.warn(getXMLifiedLogMsg(PARSE_EX, + resourceId, t)); + incrementHandledExceptions(); + } + } finally { + close(is); + } + } + } Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParallelFileProcessingResult.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParallelFileProcessingResult.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParallelFileProcessingResult.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/ParallelFileProcessingResult.java Wed Apr 1 18:27:23 2015 @@ -18,22 +18,26 @@ package org.apache.tika.batch; */ public class ParallelFileProcessingResult { - private final int considered; - private final int added; - private final int consumed; - private final double secondsElapsed; - 
private final int exitStatus; - private final String causeForTermination; - - public ParallelFileProcessingResult(int considered, int added, int consumed, double secondsElapsed, - int exitStatus, - String causeForTermination) { - this.considered = considered; - this.added = added; - this.consumed = consumed; - this.secondsElapsed = secondsElapsed; - this.exitStatus = exitStatus; - this.causeForTermination = causeForTermination; + private final int considered; + private final int added; + private final int consumed; + private final int numberHandledExceptions; + private final double secondsElapsed; + private final int exitStatus; + private final String causeForTermination; + + public ParallelFileProcessingResult(int considered, int added, + int consumed, int numberHandledExceptions, + double secondsElapsed, + int exitStatus, + String causeForTermination) { + this.considered = considered; + this.added = added; + this.consumed = consumed; + this.numberHandledExceptions = numberHandledExceptions; + this.secondsElapsed = secondsElapsed; + this.exitStatus = exitStatus; + this.causeForTermination = causeForTermination; } /** @@ -75,12 +79,16 @@ public class ParallelFileProcessingResul * @return seconds elapsed since the start of the batch processing */ public double secondsElapsed() { - return secondsElapsed; - } - - /** - * - * @return intendedExitStatus + return secondsElapsed; + } + + public int getNumberHandledExceptions() { + return numberHandledExceptions; + } + + /** + * + * @return intendedExitStatus */ public int getExitStatus() { return exitStatus; @@ -89,12 +97,13 @@ public class ParallelFileProcessingResul @Override public String toString() { return "ParallelFileProcessingResult{" + - "considered=" + considered + - ", added=" + added + - ", consumed=" + consumed + - ", secondsElapsed=" + secondsElapsed + - ", exitStatus=" + exitStatus + - ", causeForTermination='" + causeForTermination + '\'' + + "considered=" + considered + + ", added=" + added + + ", 
consumed=" + consumed + + ", numberHandledExceptions=" + numberHandledExceptions + + ", secondsElapsed=" + secondsElapsed + + ", exitStatus=" + exitStatus + + ", causeForTermination='" + causeForTermination + '\'' + '}'; } } Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/AbstractFSConsumer.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/AbstractFSConsumer.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/AbstractFSConsumer.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/AbstractFSConsumer.java Wed Apr 1 18:27:23 2015 @@ -19,13 +19,12 @@ package org.apache.tika.batch.fs; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; -import java.util.concurrent.ArrayBlockingQueue; - -import org.apache.log4j.Level; -import org.apache.tika.batch.BatchNoRestartError; -import org.apache.tika.batch.FileResource; -import org.apache.tika.batch.FileResourceConsumer; +import java.io.OutputStream; +import java.util.concurrent.ArrayBlockingQueue; + +import org.apache.tika.batch.BatchNoRestartError; +import org.apache.tika.batch.FileResource; +import org.apache.tika.batch.FileResourceConsumer; import org.apache.tika.batch.OutputStreamFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -52,54 +51,32 @@ public abstract class AbstractFSConsumer OutputStream os = null; try { os = fsOSFactory.getOutputStream(fileResource.getMetadata()); - } catch (IOException e) { - //This can happen if the disk has run out of space, - //or if there was a failure with mkdirs in fsOSFactory - logWithResourceId(Level.FATAL, "ioe_opening_os", - fileResource.getResourceId(), e); - throw new BatchNoRestartError("IOException trying to open output stream for " + - 
fileResource.getResourceId() + " :: " + e.getMessage()); - } - return os; - } - - protected InputStream getInputStream(FileResource fileResource) { - InputStream is = null; + } catch (IOException e) { + //This can happen if the disk has run out of space, + //or if there was a failure with mkdirs in fsOSFactory + logger.error("{}", getXMLifiedLogMsg(IO_OS, + fileResource.getResourceId(), e)); + throw new BatchNoRestartError("IOException trying to open output stream for " + + fileResource.getResourceId() + " :: " + e.getMessage()); + } + return os; + } + + /** + * + * @param fileResource + * @return inputStream, can be null if there is an exception opening IS + */ + protected InputStream getInputStream(FileResource fileResource) { + InputStream is = null; try { is = fileResource.openInputStream(); } catch (IOException e) { - logWithResourceId(Level.FATAL, "ioe_opening_is", - fileResource.getResourceId(), e); + logger.warn("{}", getXMLifiedLogMsg(IO_IS, + fileResource.getResourceId(), e)); flushAndClose(is); } - return is; - } - - protected void parse(final String resourceId, final Parser parser, InputStream is, - final ContentHandler handler, - final Metadata m, final ParseContext parseContext) throws Throwable { - - Throwable thrown = null; - try { - parser.parse(is, handler, m, parseContext); - } catch (Throwable t) { - if (t instanceof OutOfMemoryError) { - logWithResourceId(Level.ERROR, "oom", - resourceId, t); - } else if (t instanceof Error) { - logWithResourceId(Level.ERROR, "parse_err", - resourceId, t); - } else { - logWithResourceId(Level.WARN, "parse_ex", - resourceId, t); - incrementHandledExceptions(); - } - thrown = t; - } finally { - close(is); - } - if (thrown != null) { - throw thrown; - } - } -} + return is; + } + +} Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/BasicTikaFSConsumer.java Wed Apr 1 18:27:23 2015 @@ -19,13 +19,12 @@ package org.apache.tika.batch.fs; import java.io.InputStream; import java.io.OutputStream; -import java.io.UnsupportedEncodingException; -import java.util.concurrent.ArrayBlockingQueue; - -import org.apache.log4j.Level; -import org.apache.tika.batch.FileResource; -import org.apache.tika.batch.OutputStreamFactory; -import org.apache.tika.batch.ParserFactory; +import java.io.UnsupportedEncodingException; +import java.util.concurrent.ArrayBlockingQueue; + +import org.apache.tika.batch.FileResource; +import org.apache.tika.batch.OutputStreamFactory; +import org.apache.tika.batch.ParserFactory; import org.apache.tika.config.TikaConfig; import org.apache.tika.io.IOUtils; import org.apache.tika.parser.ParseContext; @@ -87,14 +86,14 @@ public class BasicTikaFSConsumer extends } ContentHandler handler; try { - handler = contentHandlerFactory.getNewContentHandler(os, getOutputEncoding()); - } catch (UnsupportedEncodingException e) { - incrementHandledExceptions(); - logWithResourceId(Level.FATAL, "output_encoding_ex", - fileResource.getResourceId(), e); - flushAndClose(os); - throw new RuntimeException(e.getMessage()); - } + handler = contentHandlerFactory.getNewContentHandler(os, getOutputEncoding()); + } catch (UnsupportedEncodingException e) { + incrementHandledExceptions(); + logger.error(getXMLifiedLogMsg("output_encoding_ex", + fileResource.getResourceId(), e)); + flushAndClose(os); + throw new RuntimeException(e.getMessage()); + } //now actually call parse! 
Throwable thrown = null; Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/FSBatchProcessCLI.java Wed Apr 1 18:27:23 2015 @@ -16,29 +16,24 @@ package org.apache.tika.batch.fs; * See the License for the specific language governing permissions and * limitations under the License. */ - -import java.io.File; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.ExecutorService; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; -import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.Option; -import org.apache.commons.cli.Options; -import org.apache.log4j.BasicConfigurator; -import org.apache.log4j.ConsoleAppender; -import org.apache.log4j.Level; -import org.apache.log4j.PatternLayout; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; import org.apache.tika.batch.BatchProcess; import org.apache.tika.batch.BatchProcessDriverCLI; -import org.apache.tika.batch.ParallelFileProcessingResult; +import org.apache.tika.batch.ParallelFileProcessingResult; import org.apache.tika.batch.builders.BatchProcessBuilder; import 
org.apache.tika.batch.builders.CommandLineParserBuilder; import org.apache.tika.io.IOUtils; @@ -48,39 +43,42 @@ import org.slf4j.LoggerFactory; import org.slf4j.MarkerFactory; public class FSBatchProcessCLI { + public static String FINISHED_STRING = "Main thread in TikaFSBatchCLI has finished processing."; private static Logger logger = LoggerFactory.getLogger(FSBatchProcessCLI.class); private final Options options; public FSBatchProcessCLI(String[] args) throws IOException { - TikaInputStream configIs = null; - try { - configIs = getConfigInputStream(args); - CommandLineParserBuilder builder = new CommandLineParserBuilder(); - options = builder.build(configIs); - } finally { + TikaInputStream configIs = null; + try { + configIs = getConfigInputStream(args, true); + CommandLineParserBuilder builder = new CommandLineParserBuilder(); + options = builder.build(configIs); + } finally { IOUtils.closeQuietly(configIs); } } public void usage() { HelpFormatter helpFormatter = new HelpFormatter(); - helpFormatter.printHelp("tika filesystem batch", options); - } - - private TikaInputStream getConfigInputStream(String[] args) throws IOException { - TikaInputStream is = null; - File batchConfigFile = getConfigFile(args); - if (batchConfigFile != null) { + helpFormatter.printHelp("tika filesystem batch", options); + } + + private TikaInputStream getConfigInputStream(String[] args, boolean logDefault) throws IOException { + TikaInputStream is = null; + File batchConfigFile = getConfigFile(args); + if (batchConfigFile != null) { //this will throw IOException if it can't find a specified config file - //better to throw an exception than silently back off to default. 
- is = TikaInputStream.get(batchConfigFile); - } else { - logger.info("No config file set via -bc, relying on default-tika-batch-config.xml"); - is = TikaInputStream.get( - FSBatchProcessCLI.class.getResourceAsStream("default-tika-batch-config.xml")); - } + //better to throw an exception than silently back off to default. + is = TikaInputStream.get(batchConfigFile); + } else { + if (logDefault) { + logger.info("No config file set via -bc, relying on default-tika-batch-config.xml"); + } + is = TikaInputStream.get( + FSBatchProcessCLI.class.getResourceAsStream("default-tika-batch-config.xml")); + } return is; } @@ -104,13 +102,13 @@ public class FSBatchProcessCLI { } BatchProcessBuilder b = new BatchProcessBuilder(); - TikaInputStream is = null; - BatchProcess process = null; - try { - is = getConfigInputStream(args); - process = b.build(is, mapArgs); - } finally { - IOUtils.closeQuietly(is); + TikaInputStream is = null; + BatchProcess process = null; + try { + is = getConfigInputStream(args, false); + process = b.build(is, mapArgs); + } finally { + IOUtils.closeQuietly(is); } final Thread mainThread = Thread.currentThread(); @@ -134,22 +132,11 @@ public class FSBatchProcessCLI { } } } - return configFile; - } - - + return configFile; + } + public static void main(String[] args) throws Exception { - //if no log4j config file has been set via - //sysprops, use BasicConfigurator - //TODO: figure out if this can cleanly be moved to pure slf4j? 
- String log4jFile = System.getProperty("log4j.configuration"); - if (log4jFile == null || log4jFile.trim().length()==0) { - ConsoleAppender appender = new ConsoleAppender(); - appender.setLayout(new PatternLayout("%m%n")); - appender.setWriter(new OutputStreamWriter(System.out, IOUtils.UTF_8.name())); - BasicConfigurator.configure(appender); - org.apache.log4j.Logger.getRootLogger().setLevel(Level.INFO); - } + try{ FSBatchProcessCLI cli = new FSBatchProcessCLI(args); cli.execute(args); Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java Wed Apr 1 18:27:23 2015 @@ -22,10 +22,9 @@ import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.LinkedList; -import java.util.List; +import java.util.List; import java.util.concurrent.ArrayBlockingQueue; -import org.apache.log4j.Level; import org.apache.tika.batch.FileResource; import org.apache.tika.batch.OutputStreamFactory; import org.apache.tika.batch.ParserFactory; @@ -130,14 +129,16 @@ public class RecursiveParserWrapperFSCon Writer writer = null; try { - writer = new OutputStreamWriter(os, getOutputEncoding()); - JsonMetadataList.toJson(metadataList, writer); - } catch (Exception e) { - logWithResourceId(Level.ERROR, "json_ex", - fileResource.getResourceId(), e); - } finally { - flushAndClose(writer); - } + writer = new OutputStreamWriter(os, getOutputEncoding()); + JsonMetadataList.toJson(metadataList, writer); + } catch (Exception e) { + //this is a 
stop the world kind of thing + logger.error("{}", getXMLifiedLogMsg(IO_OS+"json", + fileResource.getResourceId(), e)); + throw new RuntimeException(e); + } finally { + flushAndClose(writer); + } if (thrown != null) { if (thrown instanceof Error) { Modified: tika/trunk/tika-batch/src/test/resources/log4j.properties URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/log4j.properties?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-batch/src/test/resources/log4j.properties (original) +++ tika/trunk/tika-batch/src/test/resources/log4j.properties Wed Apr 1 18:27:23 2015 @@ -1,8 +1,22 @@ - -log4j.rootLogger=OFF,A1 - -#for debugging -#log4j.rootLogger=TRACE,A1 +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +log4j.rootLogger=OFF + +#for debugging +#log4j.rootLogger=TRACE,A1 log4j.appender.A1=org.apache.log4j.ConsoleAppender Modified: tika/trunk/tika-batch/src/test/resources/log4j_process.properties URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/log4j_process.properties?rev=1670749&r1=1670748&r2=1670749&view=diff ============================================================================== --- tika/trunk/tika-batch/src/test/resources/log4j_process.properties (original) +++ tika/trunk/tika-batch/src/test/resources/log4j_process.properties Wed Apr 1 18:27:23 2015 @@ -1,8 +1,24 @@ - -log4j.rootLogger=OFF,A1 - -#for debugging -#log4j.rootLogger=TRACE,A1 +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#This is used by the batch process; see log4j.properties for the driver + +log4j.rootLogger=OFF + +#for debugging +#log4j.rootLogger=TRACE,A1 log4j.appender.A1=org.apache.log4j.ConsoleAppender