Author: tallison Date: Tue Mar 31 01:54:40 2015 New Revision: 1670237 URL: http://svn.apache.org/r1670237 Log: TIKA-1330: add integration tests to TikaCLITest
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1670237&r1=1670236&r2=1670237&view=diff ============================================================================== --- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original) +++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Tue Mar 31 01:54:40 2015 @@ -125,7 +125,7 @@ public class TikaCLI { String[] batchArgs = BatchCommandLineBuilder.build(args); BatchProcessDriverCLI batchDriver = new BatchProcessDriverCLI(batchArgs); batchDriver.execute(); - System.exit(0); + return; } BasicConfigurator.configure( Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1670237&r1=1670236&r2=1670237&view=diff ============================================================================== --- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original) +++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Tue Mar 31 01:54:40 2015 @@ -16,16 +16,26 @@ */ package org.apache.tika.cli; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.ByteArrayOutputStream; import java.io.File; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; import java.io.PrintStream; +import java.io.Reader; import java.net.URI; +import java.util.List; + import org.apache.commons.io.FileUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.serialization.JsonMetadataList; +import org.apache.tika.parser.RecursiveParserWrapper; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -378,4 +388,96 @@ public class TikaCLITest { assertTrue(content.contains("\\n\\nembed_0")); } + @Test + public void testSimplestBatchIntegration() throws Exception { + File tempDir = File.createTempFile("tika-cli-test-batch-", ""); + tempDir.delete(); + tempDir.mkdir(); + ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(); + PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name()); + OutputStream os = System.out; + System.setOut(writer); + try { + String[] params = {escape(testDataFile.getAbsolutePath()), + escape(tempDir.getAbsolutePath())}; + TikaCLI.main(params); + + StringBuffer allFiles = new StringBuffer(); + assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile()); + assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists()); + } finally { + //reset in case something went horribly wrong + System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name())); + FileUtils.deleteDirectory(tempDir); + } + } + + @Test + public void testBasicBatchIntegration() throws Exception { + File tempDir = File.createTempFile("tika-cli-test-batch-", ""); + tempDir.delete(); + tempDir.mkdir(); + ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(); + PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name()); + OutputStream os = System.out; + System.setOut(writer); + try { + String[] params = {"-i", escape(testDataFile.getAbsolutePath()), + "-o", escape(tempDir.getAbsolutePath()), + "-numConsumers", "2", + "-reporterSleepMillis", "100"};//report often to make sure + TikaCLI.main(params); + + StringBuffer allFiles = new StringBuffer(); + assertTrue("bad_xml.xml.xml", new File(tempDir, "bad_xml.xml.xml").isFile()); + assertTrue("coffee.xls.xml", new File(tempDir, "coffee.xls.xml").exists()); + String sysOutString = new String(outBuffer.toByteArray(), IOUtils.UTF_8); + + assertEquals(-1, sysOutString.indexOf("There are 3 file processors still active")); + assertTrue(sysOutString.indexOf("There are 2 file processors") > -1); + } finally { + //reset in case something went horribly wrong + System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name())); + FileUtils.deleteDirectory(tempDir); + } + } + + @Test + public void testJsonRecursiveBatchIntegration() throws Exception { + File tempDir = File.createTempFile("tika-cli-test-batch-", ""); + tempDir.delete(); + tempDir.mkdir(); + ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(); + PrintStream writer = new PrintStream(outBuffer, true, IOUtils.UTF_8.name()); + OutputStream os = System.out; + System.setOut(writer); + Reader reader = null; + try { + String[] params = {"-i", escape(testDataFile.getAbsolutePath()), + "-o", escape(tempDir.getAbsolutePath()), + "-numConsumers", "10", + "-J", //recursive Json + "-t" //plain text in content + }; + TikaCLI.main(params); + reader = new InputStreamReader( + new FileInputStream(new File(tempDir, "test_recursive_embedded.docx.json")), IOUtils.UTF_8); + List<Metadata> metadataList = JsonMetadataList.fromJson(reader); + assertEquals(12, metadataList.size()); + assertTrue(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).contains("human events")); + } finally { + IOUtils.closeQuietly(reader); + //reset in case something went horribly wrong + System.setOut(new PrintStream(os, true, IOUtils.UTF_8.name())); + FileUtils.deleteDirectory(tempDir); + } + } + + + public static String escape(String path) { + if (path.indexOf(' ') > -1){ + return '"'+path+'"'; + } + return path; + } } Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java?rev=1670237&r1=1670236&r2=1670237&view=diff ============================================================================== --- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java (original) +++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/SimpleLogReporterBuilder.java Tue Mar 31 01:54:40 2015 @@ -30,13 +30,13 @@ public class SimpleLogReporterBuilder im @Override public StatusReporter build(FileResourceCrawler crawler, ConsumersManager consumersManager, - Node n, Map<String, String> commandlineArguments) { - - Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(n, commandlineArguments); - long sleepMillis = PropsUtil.getLong(attributes.get("sleepMillis"), 1000L); - long staleThresholdMillis = PropsUtil.getLong(attributes.get("reporterStaleThresholdMillis"), 500000L); - StatusReporter reporter = new StatusReporter(crawler, consumersManager); - reporter.setSleepMillis(sleepMillis); + Node n, Map<String, String> commandlineArguments) { + + Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(n, commandlineArguments); + long sleepMillis = PropsUtil.getLong(attributes.get("reporterSleepMillis"), 1000L); + long staleThresholdMillis = PropsUtil.getLong(attributes.get("reporterStaleThresholdMillis"), 500000L); + StatusReporter reporter = new StatusReporter(crawler, consumersManager); + reporter.setSleepMillis(sleepMillis); reporter.setStaleThresholdMillis(staleThresholdMillis); return reporter; } Modified: tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml?rev=1670237&r1=1670236&r2=1670237&view=diff ============================================================================== --- tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml (original) +++ tika/trunk/tika-batch/src/main/resources/org/apache/tika/batch/fs/default-tika-batch-config.xml Tue Mar 31 01:54:40 2015 @@ -26,13 +26,13 @@ <tika-batch-config maxAliveTimeSeconds="-1" pauseOnEarlyTerminationMillis="10000" - timeoutThresholdMillis="300000" - timeoutCheckPulseMillis="1000" - maxQueueSize="10000" - numConsumers="5"> - - <!-- options to allow on the commandline --> - <commandline> + timeoutThresholdMillis="300000" + timeoutCheckPulseMillis="1000" + maxQueueSize="10000" + numConsumers="default"> <!-- numConsumers = number of file consumers, "default" = number of processors -1 --> + + <!-- options to allow on the commandline --> + <commandline> <option opt="c" longOpt="tika-config" hasArg="true" description="TikaConfig file"/> <option opt="bc" longOpt="batch-config" hasArg="true" @@ -72,12 +72,14 @@ <option opt="timeoutThresholdMillis" hasArg="true" description="how long to wait before determining that a consumer is stale"/> <option opt="includeFilePat" hasArg="true" - description="regex that specifies which files to process"/> - <option opt="excludeFilePat" hasArg="true" - description="regex that specifies which files to avoid processing"/> - </commandline> - - + description="regex that specifies which files to process"/> + <option opt="excludeFilePat" hasArg="true" + description="regex that specifies which files to avoid processing"/> + <option opt="reporterSleepMillis" hasArg="true" + description="millisecond between reports by the reporter"/> + </commandline> + + <!-- can specify inputDir="input", but the default config should not include this --> <!-- can also specify startDir="input/someDir" to specify which child directory to start processing --> @@ -111,10 +113,10 @@ <!-- overwritePolicy: "skip" a file if output file exists, "rename" a output file, "overwrite" --> <!-- can include e.g. outputDir="output", but we don't want to include this in the default! --> <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/> - </consumers> - - <!-- reporter and interrupter are optional --> - <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" - reporterStaleThresholdMillis="60000"/> - <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> + </consumers> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" + reporterStaleThresholdMillis="60000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> </tika-batch-config> \ No newline at end of file Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml?rev=1670237&r1=1670236&r2=1670237&view=diff ============================================================================== --- tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml (original) +++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-MockConsumersBuilder.xml Tue Mar 31 01:54:40 2015 @@ -103,10 +103,10 @@ <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/> - </consumers> - - <!-- reporter and interrupter are optional --> - <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" - reporterStaleThresholdMillis="500000"/> - <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> + </consumers> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" + reporterStaleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> </tika-batch-config> \ No newline at end of file Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml?rev=1670237&r1=1670236&r2=1670237&view=diff ============================================================================== --- tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml (original) +++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-broken.xml Tue Mar 31 01:54:40 2015 @@ -96,10 +96,10 @@ <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/> - </consumers> - - <!-- reporter and interrupter are optional --> - <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" - reporterStaleThresholdMillis="500000"/> - <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> + </consumers> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" + reporterStaleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> </tika-batch-config> \ No newline at end of file Modified: tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml?rev=1670237&r1=1670236&r2=1670237&view=diff ============================================================================== --- tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml (original) +++ tika/trunk/tika-batch/src/test/resources/tika-batch-config-test.xml Tue Mar 31 01:54:40 2015 @@ -102,10 +102,10 @@ <outputstream class="FSOutputStreamFactory" encoding="UTF-8" outputSuffix="xml"/> - </consumers> - - <!-- reporter and interrupter are optional --> - <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" - reporterStaleThresholdMillis="500000"/> - <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> + </consumers> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" reporterSleepMillis="1000" + reporterStaleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> </tika-batch-config> \ No newline at end of file