There is an issue in the HtmlCleaner tracker for this: http://sourceforge.net/tracker/index.php?func=detail&aid=3190583&group_id=183053&atid=903696
On 7 Oct 2011, at 12:20, [email protected] wrote: > Author: psharples > Date: Fri Oct 7 11:20:23 2011 > New Revision: 1180006 > > URL: http://svn.apache.org/viewvc?rev=1180006&view=rev > Log: > Fix for HTMLCleaner's bad rewriting of HTML5 doctypes. (it originally added a > null and an empty string to <!DOCTYPE html> type declarations.) > > Added: > > incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java > (with props) > Modified: > incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java > > Added: > incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java > URL: > http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java?rev=1180006&view=auto > ============================================================================== > --- > incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java > (added) > +++ > incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java > Fri Oct 7 11:20:23 2011 > @@ -0,0 +1,42 @@ > +/* > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > +package org.apache.wookie.util.html; > + > +import org.htmlcleaner.DoctypeToken; > + > +/** > + * > + * An extended HTML Cleaner DocTypeToken class to deal with HTML5 > declarations better than the default, which displays empty strings and nulls. 
> + * > + * Note: <!DOCTYPE html SYSTEM "about:legacy-compat"> is also a valid HTML5 > doctype - but html cleaner only makes the html > + * into uppercase, which although is still not correct, doesn't seem to > cause problems in wookie at present. > + * > + * > http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696 > + * > + */ > +public class Html5DoctypeToken extends DoctypeToken { > + > + public static String BADDOCTYPE = "<!DOCTYPE HTML null \"\">"; > + public static String GOODDOCTYPE = "<!DOCTYPE html>"; > + > + public Html5DoctypeToken(String part1, String part2, String part3, > + String part4) { > + super(part1, part2, part3, part4); > + } > + > + public String getContent(){ > + return GOODDOCTYPE; > + } > + > +} > > Propchange: > incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java > ------------------------------------------------------------------------------ > svn:mime-type = text/plain > > Modified: > incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java > URL: > http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java?rev=1180006&r1=1180005&r2=1180006&view=diff > ============================================================================== > --- incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java > (original) > +++ incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java > Fri Oct 7 11:20:23 2011 > @@ -20,6 +20,7 @@ import java.util.ArrayList; > import java.util.List; > > import org.htmlcleaner.CleanerProperties; > +import org.htmlcleaner.DoctypeToken; > import org.htmlcleaner.TagNode; > > /** > @@ -64,11 +65,12 @@ public class HtmlCleaner implements IHtm > public void setReader(Reader reader) throws IOException{ > if (reader == null) throw new IOException("Reader was null"); > this.reader = reader; > - htmlNode = cleaner.clean(this.reader); > + htmlNode = cleaner.clean(this.reader); > headNode = 
htmlNode.findElementByName(HEAD_TAG, false); > // remove widget-specific scripts. These will be replaced > // after processing, so that the injected scripts come first > removeUserScripts(); > + fixHTML5Doctype(); > } > > /* (non-Javadoc) > @@ -167,5 +169,17 @@ public class HtmlCleaner implements IHtm > headNode.addChild(node); > } > } > + > + /** > + * Fix for a bug in HTMLCleaner which cannot handle HTML5 doctypes > correctly > + * See > http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696 > + */ > + private void fixHTML5Doctype(){ > + DoctypeToken docType = htmlNode.getDocType(); > + > if(docType.getContent().equalsIgnoreCase(Html5DoctypeToken.BADDOCTYPE)){ > + Html5DoctypeToken newToken = new > Html5DoctypeToken("html",null,null,null); > + htmlNode.setDocType(newToken); > + } > + } > > } > >
