Author: tallison Date: Fri Sep 19 14:00:24 2014 New Revision: 1626221 URL: http://svn.apache.org/r1626221 Log: TIKA-1418 add example for how to dump tika config; and add --config to CLI
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1626221&r1=1626220&r2=1626221&view=diff ============================================================================== --- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original) +++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Fri Sep 19 14:00:24 2014 @@ -16,37 +16,6 @@ */ package org.apache.tika.cli; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.io.PrintStream; -import java.io.PrintWriter; -import java.io.UnsupportedEncodingException; -import java.io.Writer; -import java.lang.reflect.Field; -import java.net.ServerSocket; -import java.net.Socket; -import java.net.URI; -import java.net.URL; -import java.nio.charset.Charset; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.TransformerConfigurationException; -import javax.xml.transform.sax.SAXTransformerFactory; -import javax.xml.transform.sax.TransformerHandler; -import javax.xml.transform.stream.StreamResult; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.log4j.BasicConfigurator; @@ -68,9 +37,9 @@ import org.apache.tika.extractor.Embedde import org.apache.tika.fork.ForkParser; import org.apache.tika.gui.TikaGUI; import org.apache.tika.io.CloseShieldInputStream; +import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.IOUtils; import org.apache.tika.io.TikaInputStream; -import org.apache.tika.io.json.JsonMetadataSerializer; import org.apache.tika.language.LanguageProfilerBuilder; import org.apache.tika.language.ProfilingHandler; import org.apache.tika.metadata.Metadata; @@ -92,7 +61,37 @@ import org.apache.tika.xmp.XMPMetadata; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.io.FilenameUtils; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.io.UnsupportedEncodingException; +import java.io.Writer; +import java.lang.reflect.Field; +import java.net.ServerSocket; +import java.net.Socket; +import java.net.URI; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; /** * Simple command line interface for Apache Tika. @@ -279,6 +278,8 @@ public class TikaCLI { private Parser parser; + private String configFilePath; + private OutputType type = XML; private LanguageProfilerBuilder ngp = null; @@ -326,7 +327,11 @@ public class TikaCLI { Logger.getRootLogger().setLevel(Level.DEBUG); } else if (arg.equals("-g") || arg.equals("--gui")) { pipeMode = false; - TikaGUI.main(new String[0]); + if (configFilePath != null){ + TikaGUI.main(new String[]{configFilePath}); + } else { + TikaGUI.main(new String[0]); + } } else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) { pipeMode = false; displayParsers(false, false); @@ -350,6 +355,8 @@ public class TikaCLI { // ignore, as container-aware detectors are now always used } else if (arg.equals("-f") || arg.equals("--fork")) { fork = true; + } else if (arg.startsWith("--config=")) { + configure(arg.substring("--config=".length())); } else if (arg.startsWith("-e")) { encoding = arg.substring("-e".length()); } else if (arg.startsWith("--encoding=")) { @@ -441,6 +448,9 @@ public class TikaCLI { out.println(" -s or --server Start the Apache Tika server"); out.println(" -f or --fork Use Fork Mode for out-of-process extraction"); out.println(); + out.println(" --config=<tika-config.xml>"); + out.println(" TikaConfig file. Must be specified before -g, -s or -f!"); + out.println(""); out.println(" -x or --xml Output XHTML content (default)"); out.println(" -h or --html Output HTML content"); out.println(" -t or --text Output plain text content"); @@ -504,6 +514,15 @@ public class TikaCLI { System.out.println(new Tika().toString()); } + + private void configure(String configFilePath) throws Exception { + this.configFilePath = configFilePath; + TikaConfig config = new TikaConfig(new File(configFilePath)); + parser = new AutoDetectParser(config); + detector = config.getDetector(); + context.set(Parser.class, parser); + } + private void displayMetModels(){ Class<?>[] modelClasses = Metadata.class.getInterfaces(); Arrays.sort(modelClasses, new Comparator<Class<?>>() { Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1626221&r1=1626220&r2=1626221&view=diff ============================================================================== --- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original) +++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Fri Sep 19 14:00:24 2014 @@ -16,27 +16,26 @@ */ package org.apache.tika.gui; -import java.awt.CardLayout; -import java.awt.Color; -import java.awt.Dimension; -import java.awt.Toolkit; -import java.awt.event.ActionEvent; -import java.awt.event.ActionListener; -import java.awt.event.KeyEvent; -import java.awt.event.WindowEvent; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.PrintWriter; -import java.io.StringWriter; -import java.io.Writer; -import java.net.MalformedURLException; -import java.net.URL; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.DocumentSelector; +import org.apache.tika.io.IOUtils; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.html.BoilerpipeContentHandler; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ContentHandlerDecorator; +import org.apache.tika.sax.TeeContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; import javax.swing.Box; import javax.swing.JDialog; @@ -61,26 +60,27 @@ import javax.xml.transform.TransformerCo import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.DocumentSelector; -import org.apache.tika.io.IOUtils; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractParser; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.html.BoilerpipeContentHandler; -import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.ContentHandlerDecorator; -import org.apache.tika.sax.TeeContentHandler; -import org.apache.tika.sax.XHTMLContentHandler; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; +import java.awt.CardLayout; +import java.awt.Color; +import java.awt.Dimension; +import java.awt.Toolkit; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.awt.event.KeyEvent; +import java.awt.event.WindowEvent; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.io.Writer; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; /** * Simple Swing GUI for Apache Tika. You can drag and drop files on top @@ -103,10 +103,16 @@ public class TikaGUI extends JFrame * @throws Exception if an error occurs */ public static void main(String[] args) throws Exception { + TikaConfig config = TikaConfig.getDefaultConfig(); + if (args.length > 0) { + File configFile = new File(args[0]); + config = new TikaConfig(configFile); + } UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName()); + final TikaConfig finalConfig = config; SwingUtilities.invokeLater(new Runnable() { public void run() { - new TikaGUI(new AutoDetectParser()).setVisible(true); + new TikaGUI(new AutoDetectParser(finalConfig)).setVisible(true); } }); } Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1626221&r1=1626220&r2=1626221&view=diff ============================================================================== --- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original) +++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Fri Sep 19 14:00:24 2014 @@ -16,19 +16,19 @@ */ package org.apache.tika.cli; +import org.apache.commons.io.FileUtils; +import org.apache.tika.exception.TikaException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + import java.io.ByteArrayOutputStream; import java.io.File; import java.io.PrintStream; import java.net.URI; -import java.util.Locale; - -import org.apache.commons.io.FileUtils; -import org.junit.After; -import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertFalse; -import org.junit.Before; -import org.junit.Test; +import static org.junit.Assert.assertTrue; /** * Tests the Tika's cli @@ -39,7 +39,8 @@ public class TikaCLITest { private File profile = null; private ByteArrayOutputStream outContent = null; private PrintStream stdout = null; - private URI testDataURI = new File("src/test/resources/test-data/").toURI(); + private File testDataFile = new File("src/test/resources/test-data"); + private URI testDataURI = testDataFile.toURI(); private String resourcePrefix; @Before @@ -296,4 +297,28 @@ public class TikaCLITest { new File("target/subdir/foo.txt").delete(); new File("target/subdir").delete(); } + + @Test + public void testDefaultConfigException() throws Exception { + //default xml parser will throw TikaException + //this and TestConfig() are broken into separate tests so that + //setUp and tearDown() are called each time + String[] params = {resourcePrefix + "bad_xml.xml"}; + boolean tikaEx = false; + try { + TikaCLI.main(params); + } catch (TikaException e) { + tikaEx = true; + } + assertTrue(tikaEx); + } + + @Test + public void testConfig() throws Exception { + String[] params = new String[]{"--config="+testDataFile.toString()+"/tika-config1.xml", resourcePrefix+"bad_xml.xml"}; + TikaCLI.main(params); + String content = outContent.toString("UTF-8"); + assertTrue(content.contains("apple")); + assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser")); + } } Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1626221&r1=1626220&r2=1626221&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Fri Sep 19 14:00:24 2014 @@ -16,16 +16,6 @@ */ package org.apache.tika.parser; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -36,6 +26,16 @@ import org.apache.tika.sax.TaggedContent import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + /** * Composite parser that delegates parsing tasks to a component parser * based on the declared content type of the incoming document. A fallback @@ -203,7 +203,6 @@ public class CompositeParser extends Abs // We always work on the normalised, canonical form type = registry.normalize(type); } - while (type != null) { // Try finding a parser for the type Parser parser = map.get(type); @@ -239,7 +238,11 @@ public class CompositeParser extends Abs TikaInputStream taggedStream = TikaInputStream.get(stream, tmp); TaggedContentHandler taggedHandler = handler != null ? new TaggedContentHandler(handler) : null; - metadata.add("X-Parsed-By", parser.getClass().getName()); + if (parser instanceof ParserDecorator){ + metadata.add("X-Parsed-By", ((ParserDecorator) parser).getWrappedParser().getClass().getName()); + } else { + metadata.add("X-Parsed-By", parser.getClass().getName()); + } try { parser.parse(taggedStream, taggedHandler, metadata, context); } catch (RuntimeException e) {