import org.cyberneko.html.HTMLConfiguration; import org.cyberneko.html.HTMLElements; import org.cyberneko.html.filters.DefaultFilter; import org.apache.xerces.util.XMLAttributesImpl; import org.apache.xerces.xni.Augmentations; import org.apache.xerces.xni.QName; import org.apache.xerces.xni.XMLAttributes; import org.apache.xerces.xni.XMLLocator; import org.apache.xerces.xni.XMLResourceIdentifier; import org.apache.xerces.xni.XMLString; import org.apache.xerces.xni.XNIException; import org.apache.xerces.xni.parser.XMLDocumentFilter; import org.apache.xerces.xni.parser.XMLInputSource; import org.apache.xerces.xni.parser.XMLParserConfiguration; import java.io.InputStream; import java.io.InputStreamReader; import java.io.BufferedReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.UnsupportedEncodingException; import java.io.IOException; import java.util.Set; /** * An HTML Parser written as a filter. Besides serializing the HTML * event stream, the writer also passes the document events to the next * stage in the pipeline. This allows applications to insert the parser * filters between other custom filters. *

* Since an HTML document may have specified its encoding using the * <META> tag and http-equiv/content attributes, the writer will * automatically change any character set specified in this tag to * match the encoding of the output stream. Therefore, the character * encoding name used to construct the writer should be an official * IANA * encoding name and not a Java encoding name. *

* Note: * The modified character set in the <META> tag is not * propagated to the next stage in the pipeline. The changed value is * only output to the stream; the original value is sent to the next * stage in the pipeline. * * @author Otis Gospodnetic * @version $Revision: $ */ public class NekoHTMLParser extends DefaultFilter implements WebDocumentParser { // // Constants // /** Notify character entity references. */ public static final String NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs"; /** Notify built-in entity references. */ public static final String NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs"; /** Augmentations feature identifier. */ protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations"; /** Filters property identifier. */ protected static final String FILTERS = "http://cyberneko.org/html/properties/filters"; protected static final String BLANK_SPACE = " "; // // Data // XMLParserConfiguration mParser; /** The encoding. */ protected String mEncoding; /** * The print writer used for serializing the document with the * appropriate character encoding. */ protected PrintWriter mPrinter; /** HTML page title text */ protected String mTitle; /** HTML page meta keywords text */ protected String mMetaKeywords; /** HTML page meta description text */ protected String mMetaDescription; /** HTML page body text */ protected StringBuffer mBody = new StringBuffer(); /** A boolean that is true when we are parsing the title tag */ protected boolean mIsTitle; /** A boolean that is true when we are parsing the meta keywords tag */ protected boolean mIsMetaKeywords; /** A boolean that is true when we are parsing the meta description tag */ protected boolean mIsMetaDescription; /** * A boolean that is true when we are parsing the body tag or any of it's * sub-elements */ protected boolean mIsBody; // state /** Seen root element. */ protected boolean mSeenRootElement; /** Seen http-equiv directive. */ protected boolean mSeenHttpEquiv; /** Element depth. */ protected int mElementDepth; /** Normalize character content. */ protected boolean mNormalize; /** Print characters. */ protected boolean mPrintChars; // // Constructors // /** * Constructs a parser filter that collects data in instance attributes. */ public NekoHTMLParser() { mEncoding = "UTF-8"; mParser = new HTMLConfiguration(); mParser.setFeature(NOTIFY_CHAR_REFS, true); mParser.setFeature(NOTIFY_HTML_BUILTIN_REFS, true); XMLDocumentFilter[] filters = new XMLDocumentFilter[] { this }; mParser.setProperty(FILTERS, filters); } /** * Constructs a writer filter using the specified output stream and * encoding. * * @param outputStream The output stream to write to. * @param encoding The encoding to be used for the output. The encoding name * should be an official IANA encoding name. */ public NekoHTMLParser(OutputStream outputStream, String encoding) throws UnsupportedEncodingException { mEncoding = encoding; mPrinter = new PrintWriter(new OutputStreamWriter(outputStream, mEncoding)); } //------------------------------------------------------------------------- // Public methods //------------------------------------------------------------------------- /** * Sets the data input. * * @param is InputStream containing HTML data * @throws IOException if there is an I/O problem */ public void setInput(InputStream is) throws IOException { mParser.parse( new XMLInputSource( null, null, null, new BufferedReader(new InputStreamReader(is)), null)); } /** * Gets the title attribute of the NekoHTMLDOMParser object. * * @return the HTML-free text of web page title as String */ public String getTitle() { return mTitle; } /** * Gets the bodyText attribute of the NekoHTMLDOMParser object. * * @return the HTML-free text of web page body as String */ public String getBody() { return mBody.toString(); } /** * * @return */ public Set getUniqueHREFLinks() { return null; } /** * * @return */ public Set getUniqueIMGLink() { return null; } //------------------------------------------------------------------------- // Public methods that implement XMLDocumentHandler interface //------------------------------------------------------------------------- /** Start document. */ public void startDocument(XMLLocator locator, String encoding, Augmentations augs) throws XNIException { // reset all variables mTitle = null; mMetaKeywords = null; mMetaDescription = null; mBody.setLength(0); mIsTitle = false; mIsMetaKeywords = false; mIsMetaDescription = false; mIsBody = false; mSeenRootElement = false; mSeenHttpEquiv = false; mElementDepth = 0; mNormalize = true; mPrintChars = true; super.startDocument(locator, encoding, augs); } /** Comment. */ // public void comment(XMLString text, Augmentations augs) // throws XNIException { // if (mSeenRootElement && mElementDepth <= 0) { // mPrinter.println(); // } // mPrinter.print(""); // if (!mSeenRootElement) { // mPrinter.println(); // } // mPrinter.flush(); // } /** Start element. */ public void startElement(QName element, XMLAttributes attributes, Augmentations augs) throws XNIException { mSeenRootElement = true; mElementDepth++; mNormalize = !HTMLElements.getElement(element.rawname).isSpecial(); // figure out which element we just started String rawName = element.rawname.toUpperCase(); if ("TITLE".equals(rawName)) { mIsTitle = true; mIsMetaKeywords = false; mIsMetaDescription = false; mIsBody = false; } else if ("META".equals(rawName)) { mIsTitle = false; mIsBody = false; if (attributes.getValue("keywords") != null) { mIsMetaKeywords = true; mIsMetaDescription = false; } else if (attributes.getValue("description") != null) { mIsMetaKeywords = false; mIsMetaDescription = true; } else { mIsMetaKeywords = false; mIsMetaDescription = false; } } else { mIsBody = true; mIsTitle = false; mIsMetaKeywords = false; mIsMetaDescription = false; } //printStartElement(element, attributes); super.startElement(element, attributes, augs); } /** Empty element. */ public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs) throws XNIException { mSeenRootElement = true; //printStartElement(element, attributes); super.emptyElement(element, attributes, augs); } /** Characters. */ public void characters(XMLString text, Augmentations augs) throws XNIException { if (mPrintChars) { if (mIsBody) { mBody.append(text.toString() + BLANK_SPACE); return; } else if (mIsTitle) { mTitle = text.toString(); return; } else if (mIsMetaKeywords) { // FIXME: this looks wrong. I need to get the text from KEYWORDS attrib mMetaKeywords = text.toString(); return; } else if (mIsMetaDescription) { // FIXME: this looks wrong. I need to get the text from DESCRIPTION attrib mMetaDescription = text.toString(); return; } //printCharacters(text, mNormalize); } super.characters(text, augs); } /** End element. */ public void endElement(QName element, Augmentations augs) throws XNIException { mElementDepth--; mNormalize = true; /** // NOTE: Not sure if this is waht should be done in the case where // the encoding is not explitly declared within the HEAD. So // I'm leaving it commented out for now. -Ac if (element.rawname.equalsIgnoreCase("head") && !mSeenHttpEquiv) { boolean capitalize = Character.isUpperCase(element.rawname.charAt(0)); String ename = capitalize ? "META" : "meta"; QName qname = new QName(null, ename, ename, null); XMLAttributes attrs = new XMLAttributesImpl(); QName aname = new QName(null, "http-equiv", "http-equiv", null); attrs.addAttribute(aname, "CDATA", "Content-Type"); aname.setValues(null, "content", "content", null); attrs.addAttribute(aname, "CDATA", "text/html; charset="+mEncoding); super.emptyElement(qname, attrs, null); } **/ mIsTitle = false; mIsBody = false; mIsMetaKeywords = false; mIsMetaDescription = false; //printEndElement(element); super.endElement(element, augs); } /** Start general entity. */ public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding, Augmentations augs) throws XNIException { mPrintChars = false; //printEntity(name); super.startGeneralEntity(name, id, encoding, augs); } /** End general entity. */ public void endGeneralEntity(String name, Augmentations augs) throws XNIException { mPrintChars = true; super.endGeneralEntity(name, augs); } //------------------------------------------------------------------------- // Protected methods //------------------------------------------------------------------------- /** Print attribute value. */ protected void printAttributeValue(String text) { int length = text.length(); for (int j = 0; j < length; j++) { char c = text.charAt(j); if (c == '"') { //mPrinter.print("""); System.out.println("""); } else { //mPrinter.print(c); System.out.println(c); } } mPrinter.flush(); } /** Print characters. */ protected void printCharacters(XMLString text, boolean normalize) { if (normalize) { for (int i = 0; i < text.length; i++) { char c = text.ch[text.offset + i]; if (c != '\n') { String entity = entity(c); if (entity != null) { printEntity(entity); } else { //mPrinter.print(c); System.out.println(c); } } else { //mPrinter.println(); System.out.println(); } } } else { for (int i = 0; i < text.length; i++) { char c = text.ch[text.offset + i]; //mPrinter.print(c); System.out.println(c); } } //mPrinter.flush(); } /** Print start element. */ protected void printStartElement(QName element, XMLAttributes attributes) { System.out.println("START ELEMENT: " + element.rawname.toUpperCase()); // modify META[@http-equiv='content-type']/@content value int contentIndex = -1; String originalContent = null; if (element.rawname.toLowerCase().equals("meta")) { String httpEquiv = null; int length = attributes.getLength(); for (int i = 0; i < length; i++) { String aname = attributes.getQName(i).toLowerCase(); if (aname.equals("http-equiv")) { httpEquiv = attributes.getValue(i); } else if (aname.equals("content")) { contentIndex = i; } } if (httpEquiv != null && httpEquiv.toLowerCase().equals("content-type")) { mSeenHttpEquiv = true; String content = null; if (contentIndex != -1) { originalContent = attributes.getValue(contentIndex); content = originalContent.toLowerCase(); } if (content != null) { int charsetIndex = content.indexOf("charset="); if (charsetIndex != -1) { content = content.substring(0, charsetIndex + 8); } else { content += ";charset="; } content += mEncoding; attributes.setValue(contentIndex, content); } } } // print element mPrinter.print('<'); mPrinter.print(element.rawname); int attrCount = attributes != null ? attributes.getLength() : 0; for (int i = 0; i < attrCount; i++) { String aname = attributes.getQName(i); String avalue = attributes.getValue(i); mPrinter.print(' '); mPrinter.print(aname); mPrinter.print("=\""); printAttributeValue(avalue); mPrinter.print('"'); } mPrinter.print('>'); mPrinter.flush(); // return original META[@http-equiv]/@content value if (contentIndex != -1) { attributes.setValue(contentIndex, originalContent); } } /** Print end element. */ protected void printEndElement(QName element) { mPrinter.print("'); mPrinter.flush(); } /** Print entity. */ protected void printEntity(String name) { mPrinter.print('&'); mPrinter.print(name); mPrinter.print(';'); mPrinter.flush(); } // // Private static methods // // NOTE: These methods are private because I have every intention // of removing them later to be replaced with something that // is designed better. -Ac /** Returns the name of the entity for the specified character. */ private static String entity(char c) { switch (c) { case 0x0026: return "amp"; case 0x003c: return "lt"; case 0x00a0: return "nbsp"; case 0x00a9: return "copy"; case 0x00ae: return "reg"; case 0x2014: return "mdash"; case 0x00a7: return "sect"; case 0x00b7: return "middot"; case 0x00e9: return "eacute"; case 0x003e: return "gt"; } return null; } // // MAIN // /** Main. */ public static void main(String[] argv) throws Exception { if (argv.length == 0) { printUsage(); System.exit(1); } NekoHTMLParser nekoParser; XMLParserConfiguration parser = new HTMLConfiguration(); parser.setFeature(NOTIFY_CHAR_REFS, true); parser.setFeature(NOTIFY_HTML_BUILTIN_REFS, true); String encoding = "Windows-1252"; boolean identity = false; for (int i = 0; i < argv.length; i++) { String arg = argv[i]; if (arg.equals("-e")) { encoding = argv[++i]; continue; } // if (arg.equals("-i")) // { // identity = true; // continue; // } if (arg.equals("-h")) { printUsage(); System.exit(1); } XMLDocumentFilter[] filters; // if (identity) // { // parser.setFeature(AUGMENTATIONS, true); // filters = new XMLDocumentFilter[] { // new Identity(), // new NekoHTMLParser(System.out, encoding) // }; // } // else { nekoParser = new NekoHTMLParser(System.out, encoding); filters = new XMLDocumentFilter[] { nekoParser }; } parser.setProperty(FILTERS, filters); parser.parse(new XMLInputSource(null, arg, null)); System.out.println("BODY:\n" + nekoParser.mBody); } } /** Print usage. */ private static void printUsage() { System.err.println("usage: java " + NekoHTMLParser.class.getName() + " (options) file ..."); System.err.println(); System.err.println("options:"); System.err.println(" -e name Specify IANA name of output encoding."); // System.err.println(" -i Perform identity transform."); System.err.println(" -h Display help screen."); } }