Author: nick
Date: Mon Jul 26 12:19:02 2010
New Revision: 979258
URL: http://svn.apache.org/viewvc?rev=979258&view=rev
Log:
TIKA-470 - New tika-app option to list the supported parsers, and their mime
types, via options of --list-parsers and --list-parser-details
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=979258&r1=979257&r2=979258&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Mon Jul
26 12:19:02 2010
@@ -24,7 +24,15 @@ import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URL;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
@@ -40,6 +48,7 @@ import org.apache.log4j.WriterAppender;
import org.apache.tika.gui.TikaGUI;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.MetadataHelper;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -115,7 +124,7 @@ public class TikaCLI {
private ParseContext context;
- private Parser parser;
+ private AutoDetectParser parser;
private Metadata metadata;
@@ -143,6 +152,12 @@ public class TikaCLI {
} else if (arg.equals("-g") || arg.equals("--gui")) {
pipeMode = false;
TikaGUI.main(new String[0]);
+ } else if (arg.equals("--list-parser") ||
arg.equals("--list-parsers")) {
+ pipeMode = false;
+ displayParsers(false);
+ } else if (arg.equals("--list-parser-detail") ||
arg.equals("--list-parser-details")) {
+ pipeMode = false;
+ displayParsers(true);
} else if (arg.startsWith("-e")) {
encoding = arg.substring("-e".length());
} else if (arg.startsWith("--encoding=")) {
@@ -200,6 +215,10 @@ public class TikaCLI {
out.println(" -t or --text Output plain text content");
out.println(" -T or --text-main Output plain text content (main
content only)");
out.println(" -m or --metadata Output only metadata");
+ out.println(" --list-parsers");
+ out.println(" List the available document parsers");
+ out.println(" --list-parser-details");
+ out.println(" List the available document parsers, and their
supported mime types");
out.println();
out.println("Description:");
out.println(" Apache Tika will parse the file(s) specified on the");
@@ -219,6 +238,40 @@ public class TikaCLI {
out.println(" extract text content and metadata from the files.");
}
+ private void displayParsers(boolean includeMimeTypes) { // Prints the parsers registered with the AutoDetectParser field to System.out; when includeMimeTypes is true, each parser's media types are listed beneath it
+ PrintStream out = System.out;
+
+ // Invert the map: getParsers() is keyed by media type, so regroup it as parser -> set of media types mapped to it
+ Map<MediaType,Parser> supported = parser.getParsers();
+ Map<Parser,Set<MediaType>> parsers = new HashMap<Parser,
Set<MediaType>>();
+ for(Entry<MediaType, Parser> e : supported.entrySet()) {
+ if(! parsers.containsKey(e.getValue())) { // first media type seen for this parser: create its set
+ parsers.put(e.getValue(), new HashSet<MediaType>());
+ }
+ parsers.get(e.getValue()).add(e.getKey());
+ }
+
+ // Sort the parsers by the name returned from getClass().getName() so they are listed in class-name order
+ Parser[] sortedParsers = parsers.keySet().toArray(new
Parser[parsers.size()]);
+ Arrays.sort(sortedParsers, new Comparator<Parser>() {
+ public int compare(Parser p1, Parser p2) {
+ String name1 = p1.getClass().getName();
+ String name2 = p2.getClass().getName();
+ return name1.compareTo(name2);
+ }
+ });
+
+ // Display: one line per parser class name, optionally followed by one line per media type
+ for(Parser p : sortedParsers) {
+ out.println(p.getClass().getName());
+ if(includeMimeTypes) {
+ for(MediaType mt : parsers.get(p)) { // iterates a HashSet, so the media types appear in no guaranteed order
+ out.println(" " + mt);
+ }
+ }
+ }
+ }
+
/**
* Returns a {@link System#out} writer with the given output encoding.
*