david.stu...@progressivealliance.co.uk
Thu, 03 Dec 2009 11:01:26 -0800
Hi, I am trying to implement a really simple example of the Tika HTML-to-XHTML parser. Here is what I have so far. I have tried two different implementations, but neither seems to produce any results. I have included the sample HTML as well.
Thanks for your help
regards,
dave
import static java.lang.System.err;
import static java.lang.System.exit;
import static java.lang.System.out;
import java.lang.Class;
import java.io.IOException;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.File;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.Tika;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Command-line demo that extracts text and metadata from a document with Tika.
 *
 * <p>The file named by the first argument is processed twice: once by an
 * {@link AutoDetectParser} writing into a {@link BodyContentHandler}, and once
 * by {@code Tika.parseToString(File)}. Both results, plus the title and
 * content type found in the metadata, are printed to standard output.
 */
public class TikaParseHtml {

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the path of the file to parse; when it is
     *             missing a usage message is printed and the JVM exits with
     *             status 1
     */
    public static void main(String args[]) throws IOException, SAXException {
        if (args.length < 1) {
            err.println("USAGE: TikaParseHtml <resourceUrl>");
            exit(1);
        }
        try {
            parse(args[0]);
        } catch (Throwable t) {
            // Single reporting point: parse() propagates its failures here.
            err.println("Could not parse document:" + t.getClass() + ":" +
                    t.getMessage());
            t.printStackTrace(err);
        }
    }

    /**
     * Parses the file at {@code resourceLocation} and prints what was extracted.
     *
     * @param resourceLocation filesystem path of the document to parse
     * @throws IOException  if the file cannot be opened or read, or if parsing
     *                      fails with any other checked exception (kept as the
     *                      cause)
     * @throws SAXException if the content handler reports a SAX error
     */
    private static void parse(String resourceLocation) throws
            IOException, SAXException {
        System.out.println(resourceLocation);
        File f = new File(resourceLocation);

        // Open the file directly. The argument is a filesystem path (it is also
        // passed to new File above); Class.getResourceAsStream performs a
        // classpath lookup and returns null for such a path, which left the
        // parser without any input.
        InputStream input = new FileInputStream(f);
        try {
            Metadata metadata = new Metadata();
            BodyContentHandler handler = new BodyContentHandler();
            AutoDetectParser parser = new AutoDetectParser();
            parser.parse(input, handler, metadata);

            String content = new Tika().parseToString(f);

            System.out.println("Content: " + content);
            System.out.println("Content2: " + handler.toString());
            System.out.println("Title: " + metadata.get(Metadata.TITLE));
            System.out.println(metadata.get(Metadata.CONTENT_TYPE));
        } catch (RuntimeException e) {
            throw e;
        } catch (IOException e) {
            throw e;
        } catch (SAXException e) {
            throw e;
        } catch (Exception e) {
            // Any other checked failure raised by the Tika calls: surface it
            // through the declared IOException instead of swallowing it.
            throw new IOException("Could not parse " + resourceLocation, e);
        } finally {
            input.close();
        }
    }
}
Test HTML:
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="DC.title" lang="en" content="The Strategies" />
<title>Dave TEst</title>
</head>
<body>
<p>THe quick brown fox</p></body>
</html>