
import java.io.*;

import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;


/**
 * <p>A parser for html. Parses the html, extracts all text, and
 * discards everything else such as comments, tags etc.</p>
 * <p>This parser does not depend on any external packages - it
 * relies on the swing components found in the JDK.</p>
 * <p>Typical use: construct with the HTML source, optionally call
 * {@link #setIgnoreTags(String[])}, call {@link #parse()} once, then
 * read the result with {@link #getText()} or {@link #getSummary()}.</p>
 * @version $Revision: 1.1 $, $Date: 2002/11/18 08:27:36 $
 * @author  Ronnie Kolehmainen
 * @see     HtmlDocument
 */
public class HtmlParser {

    /** Maximum number of characters returned by {@link #getSummary()}. */
    public static final int SUMMARY_LENGTH = 200;

    /* Private members */

    /** Source of the HTML; closed by {@link #parse()}. */
    private Reader _reader;

    /** Callback that receives the parser events; created in {@link #parse()}. */
    private HTMLEditorKit.ParserCallback _doc;

    /* Accessed by the HtmlDocument class */

    /** Tag names (without brackets) whose contents should be ignored, or null. */
    protected String [] ignoredtags = null;

    /** Sink for the extracted text; a StringWriter unless replaced. */
    protected Writer    textwriter  = null;



    /**
     * Constructor. The bytes are decoded with the default charset, since
     * no charset is passed to the underlying InputStreamReader.
     * @param in the HTML as an InputStream
     * @throws IOException if an I/O error occurs
     */
    public HtmlParser(InputStream in) throws IOException
    {
        _reader    = new InputStreamReader(in);
        textwriter = new StringWriter();
    }


    /**
     * Constructor. The file is decoded with the default charset, since
     * a plain FileReader is used.
     * @param in the HTML as a File object
     * @throws IOException if an I/O error occurs (for instance if the
     *         file cannot be opened for reading)
     */
    public HtmlParser(File in) throws IOException
    {
        _reader    = new FileReader(in);
        textwriter = new StringWriter();
    }



    /**
     * Parses the contents from the reader and stores the data (text)
     * for later retrieval. The reader and the text writer are closed
     * when done, whether or not parsing succeeded, so this method is
     * meant to be called once per instance.
     * @return true on success, false on any type of exception (including
     *         a failure to close the reader or the writer)
     */
    public boolean parse()
    {
        boolean ok = true;
        _doc = new HtmlDocument(this);

        try {
            ParserDelegator parser = new ParserDelegator();
            // third argument: ignore any charset declared inside the
            // document - the reader has already fixed the decoding
            parser.parse(_reader, _doc, true);
            _doc.flush();
            textwriter.flush();
        } catch (Exception e) {
            ok = false;
        } finally {
            // Close both ends on every path. The result is tracked in a
            // flag instead of returning from inside the finally clause.
            try {
                textwriter.close();
            } catch (Exception e) {
                // also covers a null writer, since the field is protected
                ok = false;
            }
            try {
                _reader.close();
            } catch (IOException ioe) {
                ok = false;
            }
        }
        return ok;
    }



    /**
     * Use this method to explicitly set which tags contents should be ignored.
     * Call it before {@link #parse()}; the array is stored as is (not copied).
     * @param ignoredtags the list of tag names excluding brackets
     */
    public void setIgnoreTags(String [] ignoredtags)
    {
        this.ignoredtags = ignoredtags;
    }


    /**
     * Returns all text from the html. Note: will return empty string if not parsed.
     * @return the text as String
     */
    public String getText()
    {
        return textwriter.toString();
    }


    /**
     * Returns the summary, i e the {@link #SUMMARY_LENGTH} first chars of
     * the text, or the whole text if it is shorter.
     * Note: will return empty string if not parsed.
     * @return the summary text
     */
    public String getSummary()
    {
        // Work on the string form rather than casting the writer to
        // StringWriter: the field is only declared as a Writer.
        String text = textwriter.toString();
        if (text.length() > SUMMARY_LENGTH) {
            return text.substring(0, SUMMARY_LENGTH);
        }
        return text;
    }


    /**
     * Test method. Takes file to parse as argument and writes the summary
     * and all text to a file &quot;html.txt&quot; in the current directory.
     * @param args the command line; args[0] is the filename of the file to parse
     */
    public static void main(String args[]){

        if (args == null || args.length < 1) {
            System.err.println("Usage: java HtmlParser <html-file>");
            return;
        }

        try {
            HtmlParser doc = new HtmlParser(new File(args[0]));
            if (!doc.parse()) {
                // whatever text was extracted before the failure is still written
                System.err.println("Warning: could not fully parse " + args[0]);
            }
            FileWriter w = new FileWriter(new File("html.txt"));
            try {
                w.write("Summary:\n" + doc.getSummary() + "\n\n");
                w.write("Contents:\n" + doc.getText() + "\n");
            } finally {
                // release the file handle even if a write fails
                w.close();
            }
        }
        catch(Exception e){
            e.printStackTrace();
        }

    }

}



