Author: jukka
Date: Sun Sep 20 22:59:48 2009
New Revision: 817118

URL: http://svn.apache.org/viewvc?rev=817118&view=rev
Log:
TIKA-277: Tika stand alone CLI --possibility to specify output encoding (--text)

Make the order of command line arguments less important.

Modified:
    lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java

Modified: lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=817118&r1=817117&r2=817118&view=diff
==============================================================================
--- lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Sun Sep 20 22:59:48 2009
@@ -67,13 +67,51 @@
         }
     }
 
+    private interface OutputType {
+        ContentHandler getContentHandler() throws Exception;
+    }
+
+    private final OutputType XML = new OutputType() {
+        public ContentHandler getContentHandler() throws Exception {
+            return getTransformerHandler("xml", encoding);
+        }
+    };
+
+    private final OutputType HTML = new OutputType() {
+        public ContentHandler getContentHandler() throws Exception {
+            return getTransformerHandler("html", encoding);
+        }
+    };
+
+    private final OutputType TEXT = new OutputType() {
+        public ContentHandler getContentHandler() throws Exception {
+            return new BodyContentHandler(getSystemOutWriter(encoding));
+        }
+    };
+
+    private final OutputType METADATA = new OutputType() {
+        public ContentHandler getContentHandler() throws Exception {
+            final PrintWriter writer =
+                new PrintWriter(getSystemOutWriter(encoding));
+            return new DefaultHandler() {
+                public void endDocument() {
+                    String[] names = metadata.names();
+                    Arrays.sort(names);
+                    for (String name : names) {
+                        writer.println(name + ": " + metadata.get(name));
+                    }
+                }
+            };
+        }
+    };
+
     private Map<String, Object> context;
 
     private Parser parser;
 
     private Metadata metadata;
 
-    private ContentHandler handler;
+    private OutputType type = XML;
 
     /**
      * Output character encoding, or <code>null</code> for platform default
@@ -86,7 +124,6 @@
         context = new HashMap<String, Object>();
         parser = new AutoDetectParser();
         context.put(Parser.class.getName(), parser);
-        handler = getXmlContentHandler();
     }
 
     public void process(String arg) throws Exception {
@@ -103,18 +140,20 @@
         } else if (arg.startsWith("--encoding=")) {
             encoding = arg.substring("--encoding=".length());
         } else if (arg.equals("-x") || arg.equals("--xml")) {
-            handler = getXmlContentHandler();
+            type = XML;
         } else if (arg.equals("-h") || arg.equals("--html")) {
-            handler = getHtmlContentHandler();
+            type = HTML;
         } else if (arg.equals("-t") || arg.equals("--text")) {
-            handler = getTextContentHandler();
+            type = TEXT;
         } else if (arg.equals("-m") || arg.equals("--metadata")) {
-            handler = getMetadataContentHandler();
+            type = METADATA;
         } else {
             pipeMode = false;
             metadata = new Metadata();
             if (arg.equals("-")) {
-                parser.parse(System.in, handler, metadata, context);
+                parser.parse(
+                        System.in, type.getContentHandler(),
+                        metadata, context);
             } else {
                 InputStream input;
                 File file = new File(arg);
@@ -132,7 +171,9 @@
                     input = url.openStream();
                 }
                 try {
-                    parser.parse(input, handler, metadata, context);
+                    // Read from the file/URL stream opened above, not stdin
+                    parser.parse(input, type.getContentHandler(),
+                            metadata, context);
                 } finally {
                     input.close();
                 }
@@ -172,27 +213,45 @@
         out.println("    extract text content and metadata from the files.");
     }
 
-    private ContentHandler getXmlContentHandler()
-            throws TransformerConfigurationException {
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-            SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+    /**
+     * Returns a {@link System#out} writer with the given output encoding.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
+     * @param encoding output encoding,
+     *                 or <code>null</code> for the platform default
+     * @return {@link System#out} writer
+     * @throws UnsupportedEncodingException
+     *         if the configured encoding is not supported
+     */
+    private static Writer getSystemOutWriter(String encoding)
+            throws UnsupportedEncodingException {
         if (encoding != null) {
-            handler.getTransformer().setOutputProperty(
-                    OutputKeys.ENCODING, encoding);
+            return new OutputStreamWriter(System.out, encoding);
+        } else {
+            return new OutputStreamWriter(System.out);
         }
-        handler.setResult(new StreamResult(System.out));
-        return handler;
     }
 
-    private ContentHandler getHtmlContentHandler()
+    /**
+     * Returns a transformer handler that serializes incoming SAX events
+     * to XHTML or HTML (depending on the given method) using the given output
+     * encoding.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
+     * @param method "xml" or "html"
+     * @param encoding output encoding,
+     *                 or <code>null</code> for the platform default
+     * @return {@link System#out} transformer handler
+     * @throws TransformerConfigurationException
+     *         if the transformer can not be created
+     */
+    private static TransformerHandler getTransformerHandler(
+            String method, String encoding)
             throws TransformerConfigurationException {
         SAXTransformerFactory factory = (SAXTransformerFactory)
-        SAXTransformerFactory.newInstance();
+                SAXTransformerFactory.newInstance();
         TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
         handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
         if (encoding != null) {
             handler.getTransformer().setOutputProperty(
@@ -202,39 +261,4 @@
         return handler;
     }
 
-    private ContentHandler getTextContentHandler()
-            throws UnsupportedEncodingException {
-        return new BodyContentHandler(getSystemOutWriter());
-    }
-
-    private ContentHandler getMetadataContentHandler()
-            throws UnsupportedEncodingException {
-        final PrintWriter writer = new PrintWriter(getSystemOutWriter());
-        return new DefaultHandler() {
-            public void endDocument() {
-                String[] names = metadata.names();
-                Arrays.sort(names);
-                for (String name : names) {
-                    writer.println(name + ": " + metadata.get(name));
-                }
-            }
-        };
-    }
-
-    /**
-     * Returns a {@link System#out} writer with the configured output encoding.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
-     * @return writer
-     * @throws UnsupportedEncodingException
-     *         if the configured encoding is not supported
-     */
-    private Writer getSystemOutWriter() throws UnsupportedEncodingException {
-        if (encoding != null) {
-            return new OutputStreamWriter(System.out, encoding);
-        } else {
-            return new OutputStreamWriter(System.out);
-        }
-    }
-
 }


Reply via email to