
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.HTMLElements;
import org.cyberneko.html.filters.DefaultFilter;

import org.apache.xerces.util.XMLAttributesImpl;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParserConfiguration;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.io.IOException;
import java.util.Collections;
import java.util.Locale;
import java.util.Set;

/**
 * An HTML parser written as a NekoHTML document filter. While the
 * document event stream flows through it, the filter collects the page
 * title, the <code>keywords</code> and <code>description</code> META
 * values and the text found inside the BODY element. Every event is
 * passed on unchanged to the next stage in the pipeline, which allows
 * applications to insert this filter between other custom filters.
 * <p>
 * The class also keeps a set of protected serialization helpers
 * (<code>printStartElement</code>, <code>printCharacters</code>, ...).
 * They are not called by the event handlers of this class; subclasses
 * may use them to echo the document to the output stream supplied to
 * {@link #NekoHTMLParser(OutputStream, String)}. When serializing a
 * &lt;META&gt; tag with http-equiv/content attributes,
 * <code>printStartElement</code> rewrites the character set to match
 * the encoding of the output stream. Therefore, the character encoding
 * name used to construct the filter should be an official
 * <a href='http://www.iana.org/assignments/character-sets'>IANA</a>
 * encoding name and not a Java encoding name.
 * <p>
 * <strong>Note:</strong>
 * The modified character set in the &lt;META&gt; tag is <em>not</em>
 * propagated to the next stage in the pipeline. The changed value is
 * only output to the stream; the original value is sent to the next
 * stage in the pipeline.
 * <p>
 * Instances hold per-document mutable state and are reset in
 * {@link #startDocument}; nothing in this class synchronizes access.
 *
 * @author Otis Gospodnetic
 * @version $Revision: $
 */
public class NekoHTMLParser
    extends DefaultFilter
    implements WebDocumentParser
{
    //
    // Constants
    //

    /** Notify character entity references. */
    public static final String NOTIFY_CHAR_REFS =
        "http://apache.org/xml/features/scanner/notify-char-refs";

    /** Notify built-in entity references. */
    public static final String NOTIFY_HTML_BUILTIN_REFS =
        "http://cyberneko.org/html/features/scanner/notify-builtin-refs";

    /** Augmentations feature identifier. */
    protected static final String AUGMENTATIONS =
        "http://cyberneko.org/html/features/augmentations";

    /** Filters property identifier. */
    protected static final String FILTERS =
        "http://cyberneko.org/html/properties/filters";

    /** Separator appended after every chunk of body text. */
    protected static final String BLANK_SPACE = " ";

    //
    // Data
    //

    /**
     * The parser configuration driven by {@link #setInput}. Created eagerly
     * by the no-argument constructor and lazily by <code>setInput</code>
     * otherwise.
     */
    XMLParserConfiguration mParser;

    /** The encoding. */
    protected String mEncoding;

    /**
     * The print writer used for serializing the document with the
     * appropriate character encoding. It is <code>null</code> when the
     * filter was created with the no-argument constructor; the print
     * helpers are no-ops in that case.
     */
    protected PrintWriter mPrinter;

    /** HTML page title text */
    protected String       mTitle;
    /** HTML page meta keywords text */
    protected String       mMetaKeywords;
    /** HTML page meta description text */
    protected String       mMetaDescription;
    /** HTML page body text */
    protected StringBuffer mBody = new StringBuffer();

    /** A boolean that is true when we are parsing the title tag */
    protected boolean      mIsTitle;
    /** A boolean that is true while a keywords META tag is being handled */
    protected boolean      mIsMetaKeywords;
    /** A boolean that is true while a description META tag is being handled */
    protected boolean      mIsMetaDescription;
    /**
     * A boolean that is true when we are parsing the body tag or any of its
     * sub-elements
     */
    protected boolean      mIsBody;

    // state

    /** Seen root element. */
    protected boolean mSeenRootElement;

    /** Seen http-equiv directive. */
    protected boolean mSeenHttpEquiv;

    /** Element depth. */
    protected int mElementDepth;

    /** Normalize character content. */
    protected boolean mNormalize;

    /**
     * Print characters. Set to false between the start and the end of a
     * general entity; character data reported while it is false is not
     * collected.
     */
    protected boolean mPrintChars;

    //
    // Constructors
    //

    /**
     * Constructs a parser filter that collects data in instance attributes.
     * The filter owns a parser configuration, so {@link #setInput} can be
     * called right away. No output stream is attached.
     */
    public NekoHTMLParser()
    {
        mEncoding = "UTF-8";
        mParser = createConfiguration();
        mParser.setProperty(FILTERS, new XMLDocumentFilter[] { this });
    }

    /**
     * Constructs a writer filter using the specified output stream and
     * encoding. The filter is meant to be installed in a parser
     * configuration owned by the caller (see {@link #main}); a private
     * configuration is only created if {@link #setInput} is called.
     *
     * @param outputStream The output stream to write to.
     * @param encoding The encoding to be used for the output. The encoding name
     *                 should be an official IANA encoding name.
     * @throws UnsupportedEncodingException if the encoding is not supported
     */
    public NekoHTMLParser(OutputStream outputStream, String encoding)
        throws UnsupportedEncodingException
    {
        mEncoding = encoding;
        mPrinter  = new PrintWriter(new OutputStreamWriter(outputStream, mEncoding));
    }

    //-------------------------------------------------------------------------
    // Public methods
    //-------------------------------------------------------------------------

    /**
     * Parses the HTML read from the given stream. The collected values are
     * available from the getters once this method returns.
     * <p>
     * The raw byte stream is handed to the parser without an encoding, the
     * same way {@link #main} hands it a system identifier, so the bytes are
     * no longer decoded with the platform default charset before the parser
     * sees them.
     *
     * @param is InputStream containing HTML data
     * @throws IOException if there is an I/O problem
     * @throws NullPointerException if <code>is</code> is null
     */
    public void setInput(InputStream is)
        throws IOException
    {
        if (is == null)
        {
            throw new NullPointerException("is");
        }
        if (mParser == null)
        {
            // Instance was built with the (OutputStream, String) constructor,
            // which does not create a configuration of its own.
            mParser = createConfiguration();
            mParser.setProperty(FILTERS, new XMLDocumentFilter[] { this });
        }
        mParser.parse(new XMLInputSource(null, null, null, is, null));
    }

    /**
     * Gets the title attribute of the <code>NekoHTMLParser</code> object.
     *
     * @return the HTML-free text of web page title as String, or
     *         <code>null</code> if no title text was seen
     */
    public String getTitle()
    {
        return mTitle;
    }

    /**
     * Gets the bodyText attribute of the <code>NekoHTMLParser</code> object.
     *
     * @return the HTML-free text of web page body as String; every chunk of
     *         text is followed by a single blank
     */
    public String getBody()
    {
        return mBody.toString();
    }

    /**
     * Link extraction is not implemented by this parser.
     *
     * @return an empty, unmodifiable set
     */
    public Set getUniqueHREFLinks()
    {
        return Collections.EMPTY_SET;
    }

    /**
     * Image link extraction is not implemented by this parser.
     *
     * @return an empty, unmodifiable set
     */
    public Set getUniqueIMGLink()
    {
        return Collections.EMPTY_SET;
    }

    //-------------------------------------------------------------------------
    // Public methods that implement XMLDocumentHandler interface
    //-------------------------------------------------------------------------

    /** Start document: resets all per-document state. */
    public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
        throws XNIException
    {
        mTitle             = null;
        mMetaKeywords      = null;
        mMetaDescription   = null;
        mBody.setLength(0);
        mIsTitle           = false;
        mIsMetaKeywords    = false;
        mIsMetaDescription = false;
        mIsBody            = false;
        mSeenRootElement   = false;
        mSeenHttpEquiv     = false;
        mElementDepth      = 0;
        mNormalize         = true;
        mPrintChars        = true;
        super.startDocument(locator, encoding, augs);
    }

    /**
     * Start element. Tracks entry into TITLE and BODY and records META
     * keywords/description values.
     */
    public void startElement(QName element, XMLAttributes attributes, Augmentations augs)
        throws XNIException
    {
        mSeenRootElement = true;
        mElementDepth++;
        mNormalize = !HTMLElements.getElement(element.rawname).isSpecial();

        // equalsIgnoreCase keeps the tag matching independent of the default
        // locale (toUpperCase() turns "title" into "T\u0130TLE" under Turkish).
        String name = element.rawname;
        if ("BODY".equalsIgnoreCase(name))
        {
            mIsBody = true;
        }
        else if ("TITLE".equalsIgnoreCase(name))
        {
            // A TITLE nested in the body (e.g. inside inline SVG) is body
            // text, not the page title.
            if (!mIsBody)
            {
                mIsTitle = true;
                mTitle = null;  // the most recent TITLE element wins
            }
        }
        else if ("META".equalsIgnoreCase(name))
        {
            recordMeta(attributes);
        }
        // Any other element leaves the flags alone, so text that follows a
        // nested element inside BODY is still collected.

        super.startElement(element, attributes, augs);
    }

    /**
     * Empty element. META may be reported through this callback or through
     * {@link #startElement}, so it is handled in both.
     */
    public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs)
        throws XNIException
    {
        mSeenRootElement = true;
        boolean isMeta = "META".equalsIgnoreCase(element.rawname);
        if (isMeta)
        {
            recordMeta(attributes);
        }
        super.emptyElement(element, attributes, augs);
        if (isMeta)
        {
            // The element is complete; no end-element event will clear these.
            mIsMetaKeywords = false;
            mIsMetaDescription = false;
        }
    }

    /**
     * Characters. Appends the text to the body when inside BODY, or to the
     * title when inside TITLE, and always forwards the event.
     */
    public void characters(XMLString text, Augmentations augs)
        throws XNIException
    {
        // NOTE(review): text reported inside a general entity (mPrintChars ==
        // false) is still skipped, as before; the resolved value of e.g.
        // "&amp;" therefore never reaches the title or body. Confirm whether
        // that is wanted for text extraction.
        if (mPrintChars)
        {
            if (mIsBody)
            {
                mBody.append(text.toString()).append(BLANK_SPACE);
            }
            else if (mIsTitle)
            {
                // The title may arrive in several chunks; keep all of them.
                String chunk = text.toString();
                mTitle = (mTitle == null) ? chunk : mTitle + chunk;
            }
        }
        super.characters(text, augs);
    }

    /** End element. Leaves TITLE/BODY/META state when that element closes. */
    public void endElement(QName element, Augmentations augs)
        throws XNIException
    {
        mElementDepth--;
        mNormalize = true;

        String name = element.rawname;
        if ("TITLE".equalsIgnoreCase(name))
        {
            mIsTitle = false;
        }
        else if ("BODY".equalsIgnoreCase(name))
        {
            mIsBody = false;
        }
        else if ("META".equalsIgnoreCase(name))
        {
            mIsMetaKeywords = false;
            mIsMetaDescription = false;
        }

        super.endElement(element, augs);
    }

    /** Start general entity. */
    public void startGeneralEntity(String name, XMLResourceIdentifier id,
        String encoding, Augmentations augs)
        throws XNIException
    {
        mPrintChars = false;
        super.startGeneralEntity(name, id, encoding, augs);
    }

    /** End general entity. */
    public void endGeneralEntity(String name, Augmentations augs)
        throws XNIException
    {
        mPrintChars = true;
        super.endGeneralEntity(name, augs);
    }

    //-------------------------------------------------------------------------
    // Protected methods
    //-------------------------------------------------------------------------

    /**
     * Print attribute value, escaping double quotes as <code>&amp;quot;</code>.
     * Does nothing when no output stream is attached or the value is null.
     *
     * @param text the attribute value
     */
    protected void printAttributeValue(String text)
    {
        if (mPrinter == null || text == null)
        {
            return;
        }
        int length = text.length();
        for (int j = 0; j < length; j++)
        {
            char c = text.charAt(j);
            if (c == '"')
            {
                mPrinter.print("&quot;");
            }
            else
            {
                mPrinter.print(c);
            }
        }
        mPrinter.flush();
    }

    /**
     * Print characters. Does nothing when no output stream is attached.
     *
     * @param text the characters to print
     * @param normalize if true, characters known to {@link #entity(char)}
     *                  are written as entity references and <code>'\n'</code>
     *                  as the platform line separator; if false, the
     *                  characters are written as they are
     */
    protected void printCharacters(XMLString text, boolean normalize)
    {
        if (mPrinter == null)
        {
            return;
        }
        int end = text.offset + text.length;
        for (int i = text.offset; i < end; i++)
        {
            char c = text.ch[i];
            if (!normalize)
            {
                mPrinter.print(c);
            }
            else if (c == '\n')
            {
                mPrinter.println();
            }
            else
            {
                String entity = entity(c);
                if (entity != null)
                {
                    printEntity(entity);
                }
                else
                {
                    mPrinter.print(c);
                }
            }
        }
        mPrinter.flush();
    }

    /**
     * Print start element. For a META element whose http-equiv attribute is
     * <code>content-type</code>, the content attribute is printed with its
     * charset replaced by the output encoding; the attribute is restored to
     * its original value before returning. Nothing is printed when no output
     * stream is attached.
     *
     * @param element the element name
     * @param attributes the element attributes, may be null
     */
    protected void printStartElement(QName element, XMLAttributes attributes)
    {
        // modify META[@http-equiv='content-type']/@content value
        int contentIndex = -1;
        String originalContent = null;
        if (attributes != null && "meta".equalsIgnoreCase(element.rawname))
        {
            String httpEquiv = null;
            int length = attributes.getLength();
            for (int i = 0; i < length; i++)
            {
                String aname = attributes.getQName(i);
                if ("http-equiv".equalsIgnoreCase(aname))
                {
                    httpEquiv = attributes.getValue(i);
                }
                else if ("content".equalsIgnoreCase(aname))
                {
                    contentIndex = i;
                }
            }
            if ("content-type".equalsIgnoreCase(httpEquiv))
            {
                mSeenHttpEquiv = true;
                if (contentIndex != -1)
                {
                    originalContent = attributes.getValue(contentIndex);
                }
                if (originalContent != null)
                {
                    String content = originalContent.toLowerCase(Locale.ENGLISH);
                    int charsetIndex = content.indexOf("charset=");
                    if (charsetIndex != -1)
                    {
                        // keep everything up to and including "charset="
                        content = content.substring(0, charsetIndex + 8);
                    }
                    else
                    {
                        content += ";charset=";
                    }
                    content += mEncoding;
                    attributes.setValue(contentIndex, content);
                }
            }
        }

        // print element
        if (mPrinter != null)
        {
            mPrinter.print('<');
            mPrinter.print(element.rawname);
            int attrCount = attributes != null ? attributes.getLength() : 0;
            for (int i = 0; i < attrCount; i++)
            {
                String aname = attributes.getQName(i);
                String avalue = attributes.getValue(i);
                mPrinter.print(' ');
                mPrinter.print(aname);
                mPrinter.print("=\"");
                printAttributeValue(avalue);
                mPrinter.print('"');
            }
            mPrinter.print('>');
            mPrinter.flush();
        }

        // return original META[@http-equiv]/@content value; only when it was
        // actually replaced, otherwise the attribute would be set to null
        if (contentIndex != -1 && originalContent != null)
        {
            attributes.setValue(contentIndex, originalContent);
        }
    }

    /**
     * Print end element. Does nothing when no output stream is attached.
     *
     * @param element the element name
     */
    protected void printEndElement(QName element)
    {
        if (mPrinter == null)
        {
            return;
        }
        mPrinter.print("</");
        mPrinter.print(element.rawname);
        mPrinter.print('>');
        mPrinter.flush();
    }

    /**
     * Print entity reference <code>&amp;name;</code>. Does nothing when no
     * output stream is attached.
     *
     * @param name the entity name
     */
    protected void printEntity(String name)
    {
        if (mPrinter == null)
        {
            return;
        }
        mPrinter.print('&');
        mPrinter.print(name);
        mPrinter.print(';');
        mPrinter.flush();
    }

    //
    // Private methods
    //

    /**
     * Creates an HTML parser configuration that reports character and
     * built-in entity references. The caller installs the filters.
     */
    private static XMLParserConfiguration createConfiguration()
    {
        XMLParserConfiguration config = new HTMLConfiguration();
        config.setFeature(NOTIFY_CHAR_REFS, true);
        config.setFeature(NOTIFY_HTML_BUILTIN_REFS, true);
        return config;
    }

    /**
     * Records the content attribute of a META element whose name attribute
     * is <code>keywords</code> or <code>description</code>, and sets the
     * matching flag. Attribute and value matching ignores case.
     */
    private void recordMeta(XMLAttributes attributes)
    {
        mIsMetaKeywords = false;
        mIsMetaDescription = false;
        if (attributes == null)
        {
            return;
        }

        String name = null;
        String content = null;
        int count = attributes.getLength();
        for (int i = 0; i < count; i++)
        {
            String qname = attributes.getQName(i);
            if ("name".equalsIgnoreCase(qname))
            {
                name = attributes.getValue(i);
            }
            else if ("content".equalsIgnoreCase(qname))
            {
                content = attributes.getValue(i);
            }
        }
        if (name == null)
        {
            return;
        }

        name = name.trim();
        if ("keywords".equalsIgnoreCase(name))
        {
            mIsMetaKeywords = true;
            if (content != null)
            {
                mMetaKeywords = content;
            }
        }
        else if ("description".equalsIgnoreCase(name))
        {
            mIsMetaDescription = true;
            if (content != null)
            {
                mMetaDescription = content;
            }
        }
    }

    // NOTE: These methods are private because I have every intention
    //       of removing them later to be replaced with something that
    //       is designed better. -Ac

    /**
     * Returns the name of the entity for the specified character, or null
     * if the character is not one of the few handled here.
     */
    private static String entity(char c)
    {
        switch (c)
        {
            case 0x0026: return "amp";
            case 0x003c: return "lt";
            case 0x00a0: return "nbsp";
            case 0x00a9: return "copy";
            case 0x00ae: return "reg";
            case 0x2014: return "mdash";
            case 0x00a7: return "sect";
            case 0x00b7: return "middot";
            case 0x00e9: return "eacute";
            case 0x003e: return "gt";
        }
        return null;
    }

    //
    // MAIN
    //

    /**
     * Main. Parses every file named on the command line and prints the
     * collected body text to standard output. An <code>-e name</code> option
     * applies to the files that follow it.
     */
    public static void main(String[] argv)
        throws Exception
    {
        if (argv.length == 0)
        {
            printUsage();
            System.exit(1);
        }

        XMLParserConfiguration parser = createConfiguration();
        String encoding = "Windows-1252";
        for (int i = 0; i < argv.length; i++)
        {
            String arg = argv[i];
            if (arg.equals("-e"))
            {
                if (i + 1 >= argv.length)
                {
                    // -e was the last argument: report it instead of failing
                    // with an ArrayIndexOutOfBoundsException
                    System.err.println("error: Missing encoding name after -e.");
                    printUsage();
                    System.exit(1);
                }
                encoding = argv[++i];
                continue;
            }
            if (arg.equals("-h"))
            {
                printUsage();
                System.exit(1);
            }

            NekoHTMLParser nekoParser = new NekoHTMLParser(System.out, encoding);
            parser.setProperty(FILTERS, new XMLDocumentFilter[] { nekoParser });
            parser.parse(new XMLInputSource(null, arg, null));

            System.out.println("BODY:\n" + nekoParser.getBody());
        }
    }

    /** Print usage. */
    private static void printUsage()
    {
        System.err.println("usage: java " + NekoHTMLParser.class.getName() + " (options) file ...");
        System.err.println();
        System.err.println("options:");
        System.err.println("  -e name  Specify IANA name of output encoding.");
        System.err.println("  -h       Display help screen.");
    }
}
