Author: jukka
Date: Sun Sep 20 22:14:27 2009
New Revision: 817114
URL: http://svn.apache.org/viewvc?rev=817114&view=rev
Log:
TIKA-277: Tika stand alone CLI --possibility to specify output encoding (--text)
Add the output encoding option suggested by Paul Borgermans.
Modified:
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Modified:
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=817114&r1=817113&r2=817114&view=diff
==============================================================================
--- lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
(original)
+++ lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Sun Sep 20 22:14:27 2009
@@ -19,7 +19,11 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
+import java.io.OutputStreamWriter;
import java.io.PrintStream;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.io.Writer;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
@@ -71,6 +75,11 @@
private ContentHandler handler;
+ /**
+ * Output character encoding, or <code>null</code> for platform default
+ */
+ private String encoding = null;
+
private boolean pipeMode = true;
public TikaCLI() throws TransformerConfigurationException {
@@ -89,6 +98,10 @@
} else if (arg.equals("-g") || arg.equals("--gui")) {
pipeMode = false;
TikaGUI.main(new String[0]);
+ } else if (arg.startsWith("-e")) {
+ encoding = arg.substring("-e".length());
+ } else if (arg.startsWith("--encoding=")) {
+ encoding = arg.substring("--encoding=".length());
} else if (arg.equals("-x") || arg.equals("--xml")) {
handler = getXmlContentHandler();
} else if (arg.equals("-h") || arg.equals("--html")) {
@@ -132,13 +145,14 @@
out.println("usage: tika [option] [file]");
out.println();
out.println("Options:");
- out.println(" -? or --help Print this usage message");
- out.println(" -v or --verbose Print debug level messages");
- out.println(" -g or --gui Start the Apache Tika GUI");
- out.println(" -x or --xml Output XHTML content (default)");
- out.println(" -h or --html Output HTML content");
- out.println(" -t or --text Output plain text content");
- out.println(" -m or --metadata Output only metadata");
+ out.println(" -? or --help Print this usage message");
+ out.println(" -v or --verbose Print debug level messages");
+ out.println(" -g or --gui Start the Apache Tika GUI");
+ out.println(" -eX or --encoding=X Use output encoding X");
+ out.println(" -x or --xml Output XHTML content (default)");
+ out.println(" -h or --html Output HTML content");
+ out.println(" -t or --text Output plain text content");
+ out.println(" -m or --metadata Output only metadata");
out.println();
out.println("Description:");
out.println(" Apache Tika will parse the file(s) specified on the");
@@ -165,6 +179,10 @@
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ if (encoding != null) {
+ handler.getTransformer().setOutputProperty(
+ OutputKeys.ENCODING, encoding);
+ }
handler.setResult(new StreamResult(System.out));
return handler;
}
@@ -176,24 +194,47 @@
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ if (encoding != null) {
+ handler.getTransformer().setOutputProperty(
+ OutputKeys.ENCODING, encoding);
+ }
handler.setResult(new StreamResult(System.out));
return handler;
}
- private ContentHandler getTextContentHandler() {
- return new BodyContentHandler(System.out);
+ private ContentHandler getTextContentHandler()
+ throws UnsupportedEncodingException {
+ return new BodyContentHandler(getSystemOutWriter());
}
- private ContentHandler getMetadataContentHandler() {
+ private ContentHandler getMetadataContentHandler()
+ throws UnsupportedEncodingException {
+ final PrintWriter writer = new PrintWriter(getSystemOutWriter());
return new DefaultHandler() {
public void endDocument() {
String[] names = metadata.names();
Arrays.sort(names);
for (String name : names) {
- System.out.println(name + ": " + metadata.get(name));
+ writer.println(name + ": " + metadata.get(name));
}
}
};
}
+ /**
+ * Returns a {...@link System#out} writer with the configured output
encoding.
+ *
+ * @see <a
href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
+ * @return writer
+ * @throws UnsupportedEncodingException
+ * if the configured encoding is not supported
+ */
+ private Writer getSystemOutWriter() throws UnsupportedEncodingException {
+ if (encoding != null) {
+ return new OutputStreamWriter(System.out, encoding);
+ } else {
+ return new OutputStreamWriter(System.out);
+ }
+ }
+
}