There is an issue in the HtmlCleaner tracker for this:

http://sourceforge.net/tracker/index.php?func=detail&aid=3190583&group_id=183053&atid=903696

On 7 Oct 2011, at 12:20, [email protected] wrote:

> Author: psharples
> Date: Fri Oct  7 11:20:23 2011
> New Revision: 1180006
> 
> URL: http://svn.apache.org/viewvc?rev=1180006&view=rev
> Log:
> Fix for HTMLCleaner's bad rewriting of HTML5 doctypes (it originally added a 
> null and an empty string to <!DOCTYPE html> type declarations).
> 
> Added:
>    
> incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java 
>   (with props)
> Modified:
>    incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
> 
> Added: 
> incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
> URL: 
> http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java?rev=1180006&view=auto
> ==============================================================================
> --- 
> incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java 
> (added)
> +++ 
> incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java 
> Fri Oct  7 11:20:23 2011
> @@ -0,0 +1,42 @@
> +/*
> + *  Licensed under the Apache License, Version 2.0 (the "License");
> + *  you may not use this file except in compliance with the License.
> + *  You may obtain a copy of the License at
> + *
> + *      http://www.apache.org/licenses/LICENSE-2.0
> + *
> + *  Unless required by applicable law or agreed to in writing, software
> + *  distributed under the License is distributed on an "AS IS" BASIS,
> + *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + *  See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.wookie.util.html;
> +
> +import org.htmlcleaner.DoctypeToken;
> +
> +/**
> + * 
> + * An extended HTML Cleaner DocTypeToken class to deal with HTML5 
> declarations better than the default, which displays empty strings and nulls.
> + * 
> + * Note: <!DOCTYPE html SYSTEM "about:legacy-compat"> is also a valid HTML5 
> doctype - but html cleaner only converts the html
> + * to uppercase, which, although still not correct, doesn't seem to 
> cause problems in wookie at present.
> + * 
> + * 
> http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
> + * 
> + */
> +public class Html5DoctypeToken extends DoctypeToken {
> +     
> +     public static String BADDOCTYPE = "<!DOCTYPE HTML null \"\">";
> +     public static String GOODDOCTYPE = "<!DOCTYPE html>";
> +
> +     public Html5DoctypeToken(String part1, String part2, String part3,
> +                     String part4) {
> +             super(part1, part2, part3, part4);              
> +     }
> +             
> +     public String getContent(){
> +             return GOODDOCTYPE;
> +     }
> +
> +}
> 
> Propchange: 
> incubator/wookie/trunk/src/org/apache/wookie/util/html/Html5DoctypeToken.java
> ------------------------------------------------------------------------------
>    svn:mime-type = text/plain
> 
> Modified: 
> incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java
> URL: 
> http://svn.apache.org/viewvc/incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java?rev=1180006&r1=1180005&r2=1180006&view=diff
> ==============================================================================
> --- incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java 
> (original)
> +++ incubator/wookie/trunk/src/org/apache/wookie/util/html/HtmlCleaner.java 
> Fri Oct  7 11:20:23 2011
> @@ -20,6 +20,7 @@ import java.util.ArrayList;
> import java.util.List;
> 
> import org.htmlcleaner.CleanerProperties;
> +import org.htmlcleaner.DoctypeToken;
> import org.htmlcleaner.TagNode;
> 
> /**
> @@ -64,11 +65,12 @@ public class HtmlCleaner implements IHtm
>       public void setReader(Reader reader) throws IOException{
>               if (reader == null) throw new IOException("Reader was null");
>               this.reader = reader;
> -             htmlNode = cleaner.clean(this.reader);                  
> +             htmlNode = cleaner.clean(this.reader);
>               headNode = htmlNode.findElementByName(HEAD_TAG, false); 
>               // remove widget-specific scripts. These will be replaced
>               // after processing, so that the injected scripts come first
>               removeUserScripts();
> +             fixHTML5Doctype();
>       }
>       
>       /* (non-Javadoc)
> @@ -167,5 +169,17 @@ public class HtmlCleaner implements IHtm
>                       headNode.addChild(node);
>               }
>       }
> +     
> +     /**
> +      *  Fix for a bug in HTMLCleaner which cannot handle HTML5 doctypes 
> correctly
> +      *  See 
> http://sourceforge.net/tracker/?func=detail&aid=3190583&group_id=183053&atid=903696
> +      */
> +     private void fixHTML5Doctype(){
> +             DoctypeToken docType = htmlNode.getDocType();
> +             
> if(docType.getContent().equalsIgnoreCase(Html5DoctypeToken.BADDOCTYPE)){
> +                     Html5DoctypeToken newToken = new 
> Html5DoctypeToken("html",null,null,null);
> +                     htmlNode.setDocType(newToken);
> +             }
> +     }
> 
> }
> 
> 

Reply via email to