Author: nick Date: Mon Aug 10 06:24:57 2015 New Revision: 1694961 URL: http://svn.apache.org/r1694961 Log: Several people on StackOverflow are getting confused by this example, so show how to use AutoDetectParser first, and all the individual components second.
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1694961&r1=1694960&r2=1694961&view=diff ============================================================================== --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java (original) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Mon Aug 10 06:24:57 2015 @@ -19,11 +19,13 @@ import java.io.File; import org.apache.commons.io.FileUtils; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.language.LanguageProfile; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; @@ -33,14 +35,45 @@ import org.xml.sax.ContentHandler; * Demonstrates how to call the different components within Tika: its * {@link Detector} framework (aka MIME identification and repository), its * {@link Parser} interface, its {@link LanguageIdentifier} and other goodies. 
+ * It also shows the "easy way" via {@link AutoDetectParser} */ @SuppressWarnings("deprecation") public class MyFirstTika { public static void main(String[] args) throws Exception { String filename = args[0]; - MimeTypes mimeRegistry = TikaConfig.getDefaultConfig() - .getMimeRepository(); + TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); + + Metadata metadata = new Metadata(); + String text = parseUsingComponents(filename, tikaConfig, metadata); + System.out.println("Parsed Metadata: "); + System.out.println(metadata); + System.out.println("Parsed Text: "); + System.out.println(text); + + System.out.println("-------------------------"); + + metadata = new Metadata(); + text = parseUsingAutoDetect(filename, tikaConfig, metadata); + System.out.println("Parsed Metadata: "); + System.out.println(metadata); + System.out.println("Parsed Text: "); + System.out.println(text); + } + + public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, + Metadata metadata) throws Exception { + System.out.println("Handling using AutoDetectParser: [" + filename + "]"); + + AutoDetectParser parser = new AutoDetectParser(tikaConfig); + ContentHandler handler = new BodyContentHandler(); + TikaInputStream stream = TikaInputStream.get(new File(filename)); + parser.parse(stream, handler, metadata, new ParseContext()); + return handler.toString(); + } + public static String parseUsingComponents(String filename, TikaConfig tikaConfig, + Metadata metadata) throws Exception { + MimeTypes mimeRegistry = tikaConfig.getMimeRepository(); System.out.println("Examining: [" + filename + "]"); @@ -51,8 +84,7 @@ public class MyFirstTika { + mimeRegistry.getMimeType(new File(filename)) + "]"); Detector mimeDetector = (Detector) mimeRegistry; - System.out - .println("The MIME type (based on the Detector interface) is: [" + System.out.println("The MIME type (based on the Detector interface) is: [" + mimeDetector.detect(new File(filename).toURI().toURL() .openStream(), new 
Metadata()) + "]"); @@ -62,16 +94,12 @@ public class MyFirstTika { System.out.println("The language of this content is: [" + lang.getLanguage() + "]"); - Parser parser = TikaConfig.getDefaultConfig().getParser( + Parser parser = tikaConfig.getParser( MediaType.parse(mimeRegistry.getMimeType(filename).getName())); - Metadata parsedMet = new Metadata(); ContentHandler handler = new BodyContentHandler(); parser.parse(new File(filename).toURI().toURL().openStream(), handler, - parsedMet, new ParseContext()); - - System.out.println("Parsed Metadata: "); - System.out.println(parsedMet); - System.out.println("Parsed Text: "); - System.out.println(handler.toString()); + metadata, new ParseContext()); + + return handler.toString(); } }