Hi,

I am trying to implement a really simple example of the Tika HTML-to-XHTML
parser; here is what I have so far. I have tried two different implementations,
but neither seems to produce any results. I have included the sample HTML as well.

Thanks for your help

regards,


dave

import static java.lang.System.err;
import static java.lang.System.exit;
import static java.lang.System.out;
import java.lang.Class;

import java.io.IOException;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.File;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.Tika;
import org.apache.tika.sax.BodyContentHandler;

import org.xml.sax.SAXException;

public class TikaParseHtml {

    public static void main(String args[]) throws IOException, SAXException{
        if (args.length < 1) {
            err.println("USAGE: TikaParseHtml <resourceUrl>");
            exit(1);
        }

        try {
            parse(args[0]);
        } catch (Throwable t) {
            err.println("Could not parse document:" + t.getClass() + ":" +
t.getMessage());
            t.printStackTrace(err);
        }
    }

    private static void parse(String resourceLocation) throws
IOException,SAXException {
        try {
            System.out.println(resourceLocation);
            File f = new File(resourceLocation);
            Class c = null;
            try {
                c = Class.forName("TikaParseHtml");
            } catch (Exception ex) {
                // This should not happen.
            }
        
            InputStream input = c.getResourceAsStream(resourceLocation);
            Metadata metadata = new Metadata();
    
            BodyContentHandler handler = new BodyContentHandler();
            AutoDetectParser parser = new AutoDetectParser();
            parser.parse(input, handler, metadata);
    
            String content = new Tika().parseToString(f);
            System.out.println("Content: " + content);    
            System.out.println("Content2: " + handler.toString());
            System.out.println("Title: " + metadata.get(Metadata.TITLE));
            System.out.println(metadata.get(Metadata.CONTENT_TYPE));
        }
        catch (Exception e) {
            e.printStackTrace();
        }

    }
}
Test html

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
 <meta name="DC.title" lang="en" content="The Strategies" />
 <title>Dave TEst</title>
</head>
<body>
 <p>THe quick brown fox</p></body>
</html>

Reply via email to