Author: tallison
Date: Fri Sep 19 14:00:24 2014
New Revision: 1626221

URL: http://svn.apache.org/r1626221
Log:
TIKA-1418 add example for how to dump tika config; and add --config to CLI

Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1626221&r1=1626220&r2=1626221&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Fri Sep 19 14:00:24 2014
@@ -16,37 +16,6 @@
  */
 package org.apache.tika.cli;
 
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintStream;
-import java.io.PrintWriter;
-import java.io.UnsupportedEncodingException;
-import java.io.Writer;
-import java.lang.reflect.Field;
-import java.net.ServerSocket;
-import java.net.Socket;
-import java.net.URI;
-import java.net.URL;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.TransformerConfigurationException;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.log4j.BasicConfigurator;
@@ -68,9 +37,9 @@ import org.apache.tika.extractor.Embedde
 import org.apache.tika.fork.ForkParser;
 import org.apache.tika.gui.TikaGUI;
 import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.io.json.JsonMetadataSerializer;
 import org.apache.tika.language.LanguageProfilerBuilder;
 import org.apache.tika.language.ProfilingHandler;
 import org.apache.tika.metadata.Metadata;
@@ -92,7 +61,37 @@ import org.apache.tika.xmp.XMPMetadata;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.io.FilenameUtils;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.io.Writer;
+import java.lang.reflect.Field;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.URI;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
 
 /**
  * Simple command line interface for Apache Tika.
@@ -279,6 +278,8 @@ public class TikaCLI {
 
     private Parser parser;
 
+    private String configFilePath;
+
     private OutputType type = XML;
     
     private LanguageProfilerBuilder ngp = null;
@@ -326,7 +327,11 @@ public class TikaCLI {
             Logger.getRootLogger().setLevel(Level.DEBUG);
         } else if (arg.equals("-g") || arg.equals("--gui")) {
             pipeMode = false;
-            TikaGUI.main(new String[0]);
+            if (configFilePath != null){
+                TikaGUI.main(new String[]{configFilePath});
+            } else {
+                TikaGUI.main(new String[0]);
+            }
         } else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) {
             pipeMode = false;
             displayParsers(false, false);
@@ -350,6 +355,8 @@ public class TikaCLI {
             // ignore, as container-aware detectors are now always used
         } else if (arg.equals("-f") || arg.equals("--fork")) {
             fork = true;
+        } else if (arg.startsWith("--config=")) {
+            configure(arg.substring("--config=".length()));
         } else if (arg.startsWith("-e")) {
             encoding = arg.substring("-e".length());
         } else if (arg.startsWith("--encoding=")) {
@@ -441,6 +448,9 @@ public class TikaCLI {
         out.println("    -s  or --server        Start the Apache Tika server");
         out.println("    -f  or --fork          Use Fork Mode for out-of-process extraction");
         out.println();
+        out.println("    --config=<tika-config.xml>");
+        out.println("        TikaConfig file. Must be specified before -g, -s or -f!");
+        out.println("");
         out.println("    -x  or --xml           Output XHTML content (default)");
         out.println("    -h  or --html          Output HTML content");
         out.println("    -t  or --text          Output plain text content");
@@ -504,6 +514,15 @@ public class TikaCLI {
         System.out.println(new Tika().toString());
     }
 
+
+    private void configure(String configFilePath) throws Exception {
+        this.configFilePath = configFilePath;
+        TikaConfig config = new TikaConfig(new File(configFilePath));
+        parser = new AutoDetectParser(config);
+        detector = config.getDetector();
+        context.set(Parser.class, parser);
+    }
+
     private void displayMetModels(){
         Class<?>[] modelClasses = Metadata.class.getInterfaces();
         Arrays.sort(modelClasses, new Comparator<Class<?>>() {

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1626221&r1=1626220&r2=1626221&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Fri Sep 19 14:00:24 2014
@@ -16,27 +16,26 @@
  */
 package org.apache.tika.gui;
 
-import java.awt.CardLayout;
-import java.awt.Color;
-import java.awt.Dimension;
-import java.awt.Toolkit;
-import java.awt.event.ActionEvent;
-import java.awt.event.ActionListener;
-import java.awt.event.KeyEvent;
-import java.awt.event.WindowEvent;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PrintWriter;
-import java.io.StringWriter;
-import java.io.Writer;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.DocumentSelector;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 import javax.swing.Box;
 import javax.swing.JDialog;
@@ -61,26 +60,27 @@ import javax.xml.transform.TransformerCo
 import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.DocumentSelector;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.html.BoilerpipeContentHandler;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
+import java.awt.CardLayout;
+import java.awt.Color;
+import java.awt.Dimension;
+import java.awt.Toolkit;
+import java.awt.event.ActionEvent;
+import java.awt.event.ActionListener;
+import java.awt.event.KeyEvent;
+import java.awt.event.WindowEvent;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.io.Writer;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
 
 /**
  * Simple Swing GUI for Apache Tika. You can drag and drop files on top
@@ -103,10 +103,16 @@ public class TikaGUI extends JFrame
      * @throws Exception if an error occurs
      */
     public static void main(String[] args) throws Exception {
+        TikaConfig config = TikaConfig.getDefaultConfig();
+        if (args.length > 0) {
+            File configFile = new File(args[0]);
+            config = new TikaConfig(configFile);
+        }
         UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
+        final TikaConfig finalConfig = config;
         SwingUtilities.invokeLater(new Runnable() {
             public void run() {
-                new TikaGUI(new AutoDetectParser()).setVisible(true);
+                new TikaGUI(new AutoDetectParser(finalConfig)).setVisible(true);
             }
         });
     }

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1626221&r1=1626220&r2=1626221&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Fri Sep 19 14:00:24 2014
@@ -16,19 +16,19 @@
  */
 package org.apache.tika.cli;
 
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.exception.TikaException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.PrintStream;
 import java.net.URI;
-import java.util.Locale;
-
-import org.apache.commons.io.FileUtils;
 
-import org.junit.After;
-import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.assertFalse;
-import org.junit.Before;
-import org.junit.Test;
+import static org.junit.Assert.assertTrue;
 
 /**
  * Tests the Tika's cli
@@ -39,7 +39,8 @@ public class TikaCLITest {
     private File profile = null;
     private ByteArrayOutputStream outContent = null;
     private PrintStream stdout = null;
-    private URI testDataURI = new File("src/test/resources/test-data/").toURI();
+    private File testDataFile = new File("src/test/resources/test-data");
+    private URI testDataURI = testDataFile.toURI();
     private String resourcePrefix;
 
     @Before
@@ -296,4 +297,28 @@ public class TikaCLITest {
         new File("target/subdir/foo.txt").delete();
         new File("target/subdir").delete();
     }
+
+    @Test
+    public void testDefaultConfigException() throws Exception {
+        //default xml parser will throw TikaException
+        //this and TestConfig() are broken into separate tests so that
+        //setUp and tearDown() are called each time
+        String[] params = {resourcePrefix + "bad_xml.xml"};
+        boolean tikaEx = false;
+        try {
+            TikaCLI.main(params);
+        } catch (TikaException e) {
+            tikaEx = true;
+        }
+        assertTrue(tikaEx);
+    }
+
+    @Test
+    public void testConfig() throws Exception {
+        String[] params = new String[]{"--config="+testDataFile.toString()+"/tika-config1.xml", resourcePrefix+"bad_xml.xml"};
+        TikaCLI.main(params);
+        String content = outContent.toString("UTF-8");
+        assertTrue(content.contains("apple"));
+        assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser"));
+    }
 }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1626221&r1=1626220&r2=1626221&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Fri Sep 19 14:00:24 2014
@@ -16,16 +16,6 @@
  */
 package org.apache.tika.parser;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
@@ -36,6 +26,16 @@ import org.apache.tika.sax.TaggedContent
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
 /**
  * Composite parser that delegates parsing tasks to a component parser
  * based on the declared content type of the incoming document. A fallback
@@ -203,7 +203,6 @@ public class CompositeParser extends Abs
            // We always work on the normalised, canonical form
            type = registry.normalize(type);
         }
-        
         while (type != null) {
             // Try finding a parser for the type
             Parser parser = map.get(type);
@@ -239,7 +238,11 @@ public class CompositeParser extends Abs
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             TaggedContentHandler taggedHandler = 
                 handler != null ? new TaggedContentHandler(handler) : null;
-           metadata.add("X-Parsed-By", parser.getClass().getName());
+            if (parser instanceof ParserDecorator){
+                metadata.add("X-Parsed-By", ((ParserDecorator) parser).getWrappedParser().getClass().getName());
+            } else {
+                metadata.add("X-Parsed-By", parser.getClass().getName());
+            }
             try {
                 parser.parse(taggedStream, taggedHandler, metadata, context);
             } catch (RuntimeException e) {


Reply via email to