Hi, I am trying to implement a really simple example of the Tika HTML-to-XHTML parser. Here is what I have so far. I have tried two different implementations, but neither seems to produce any results. I have included the sample HTML as well.
Thanks for your help regards, dave

import static java.lang.System.err;
import static java.lang.System.exit;
import static java.lang.System.out;

import java.lang.Class;
import java.io.IOException;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.File;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.Tika;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

/**
 * Minimal command-line example that runs Apache Tika over a single file and
 * prints the extracted text, the title and the detected content type.
 *
 * <p>The file is parsed twice on purpose: once through {@link AutoDetectParser}
 * with a {@link BodyContentHandler}, and once through the {@link Tika} facade,
 * so the two results can be compared side by side.
 */
public class TikaParseHtml {

    /**
     * Entry point.
     *
     * @param args expects one element: the path of the file to parse
     * @throws IOException  declared for compatibility; failures are reported on stderr
     * @throws SAXException declared for compatibility; failures are reported on stderr
     */
    public static void main(String[] args) throws IOException, SAXException {
        if (args.length < 1) {
            err.println("USAGE: TikaParseHtml <resourceUrl>");
            exit(1);
        }
        try {
            parse(args[0]);
        } catch (Throwable t) {
            // Top-level boundary: report everything (including linkage errors
            // caused by missing Tika parser jars) and signal failure to the caller.
            err.println("Could not parse document:" + t.getClass() + ":" + t.getMessage());
            t.printStackTrace(err);
            exit(1);
        }
    }

    /**
     * Parses the file at {@code resourceLocation} and prints what Tika extracted.
     *
     * @param resourceLocation file-system path of the document to parse
     * @throws IOException  if the file cannot be opened or read, or if Tika
     *                      itself fails (the {@link TikaException} is the cause)
     * @throws SAXException if the content handler rejects the SAX event stream
     */
    private static void parse(String resourceLocation) throws IOException, SAXException {
        System.out.println(resourceLocation);
        File f = new File(resourceLocation);

        Metadata metadata = new Metadata();
        BodyContentHandler handler = new BodyContentHandler();
        AutoDetectParser parser = new AutoDetectParser();

        String content;
        // Read the very same file that parseToString() reads below. The earlier
        // classpath lookup (getResourceAsStream) returned null for any path that
        // was not on the classpath, which left the parser without input.
        try (InputStream input = new FileInputStream(f)) {
            parser.parse(input, handler, metadata);
            content = new Tika().parseToString(f);
        } catch (TikaException e) {
            // Keep the declared signature: surface Tika failures as IOException
            // with the original exception preserved as the cause.
            throw new IOException("Tika could not parse " + resourceLocation, e);
        }

        System.out.println("Content: " + content);
        System.out.println("Content2: " + handler.toString());
        System.out.println("Title: " + metadata.get(Metadata.TITLE));
        System.out.println(metadata.get(Metadata.CONTENT_TYPE));
    }
}

Test html

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="DC.title" lang="en" content="The Strategies" />
<title>Dave TEst</title>
</head>
<body>
<p>THe quick brown fox</p></body>
</html>