[cp-patches] FYI: HTML parsing fix

Roman Kennke Wed, 15 Nov 2006 05:36:21 -0800

This enhances the HTML parser and makes it more fault tolerant. Until
now HTML like the following would lead to a borked element structure
and/or exceptions being thrown:
<ul>
<li>adsa</li>
</li></li>
</ul>


(too many </li> tags).

This patch makes the parser more resistant to such code and yields
the correct structure:

<ul>
<li>adsa</li>
</ul>

... Make the planet renderable again.

2006-11-15  Roman Kennke  <[EMAIL PROTECTED]>

        * javax/swing/text/html/HTMLDocument.java
        (HTMLReader.ParagraphAction.end): Call super instead of blockClose()
        directly.
        (HTMLReader.ParagraphAction.start): Call super instead of blockOpen()
        directly.
        (HTMLReader.parseStack): Removed.
        (HTMLReader.blockClose): Simply call addContent() with ' '
        instead of doing more complicated stuff. Removed parseStack
        handling.
        (HTMLReader.blockOpen): Removed parseStack handling.
        (getInsertingReader): Removed parseStack init.
        * gnu/javax/swing/text/html/parser/htmlValidator.java
        (closeTag): Return true only when the tag actually should be
        closed.
        * gnu/javax/swing/text/html/parser/support/Parser.java
        (_handleEndTag): Only actually close the tag when the validator
        allows it.

/Roman

Index: javax/swing/text/html/HTMLDocument.java
===================================================================
RCS file: /cvsroot/classpath/classpath/javax/swing/text/html/HTMLDocument.java,v
retrieving revision 1.49
diff -u -1 -5 -r1.49 HTMLDocument.java
--- javax/swing/text/html/HTMLDocument.java	11 Nov 2006 11:02:07 -0000	1.49
+++ javax/swing/text/html/HTMLDocument.java	15 Nov 2006 13:31:01 -0000
@@ -511,36 +511,30 @@
    * @author Anthony Balkissoon abalkiss at redhat dot com
    */
   public class HTMLReader extends HTMLEditorKit.ParserCallback
   {    
     /**
      * Holds the current character attribute set *
      */
     protected MutableAttributeSet charAttr = new SimpleAttributeSet();
     
     protected Vector parseBuffer = new Vector();
     
     /** 
      * A stack for character attribute sets *
      */
     Stack charAttrStack = new Stack();
-
-    /**
-     * The parse stack. This stack holds HTML.Tag objects that reflect the
-     * current position in the parsing process.
-     */
-    Stack parseStack = new Stack();
    
     /** A mapping between HTML.Tag objects and the actions that handle them **/
     HashMap tagToAction;
     
     /** Tells us whether we've received the '</html>' tag yet **/
     boolean endHTMLEncountered = false;
     
     /** 
      * Related to the constructor with explicit insertTag 
      */
     int popDepth;
     
     /**
      * Related to the constructor with explicit insertTag
      */    
@@ -804,41 +798,41 @@
       {
         blockOpen(HTML.Tag.IMPLIED, new SimpleAttributeSet());
         addSpecialElement(t, a);
         blockClose(HTML.Tag.IMPLIED);
       }
     }
     
     public class ParagraphAction extends BlockAction
     {
       /**
        * This method is called when a start tag is seen for one of the types
        * of tags associated with this Action.
        */
       public void start(HTML.Tag t, MutableAttributeSet a)
       {
-        blockOpen(t, a);
+        super.start(t, a);
         inParagraph = true;
       }
       
       /**
        * Called when an end tag is seen for one of the types of tags associated
        * with this Action.
        */
       public void end(HTML.Tag t)
       {
-        blockClose(t);
+        super.end(t);
         inParagraph = false;
       } 
     }
 
     /**
      * This action is performed when a &lt;pre&gt; tag is parsed.
      */
     public class PreAction extends BlockAction
     {
       /**
        * This method is called when a start tag is seen for one of the types
        * of tags associated with this Action.
        */
       public void start(HTML.Tag t, MutableAttributeSet a)
       {
@@ -1500,31 +1494,30 @@
     
     /**
      * Instructs the parse buffer to create a block element with the given 
      * attributes.
      * 
      * @param t the tag that requires opening a new block
      * @param attr the attribute set for the new block
      */
     protected void blockOpen(HTML.Tag t, MutableAttributeSet attr)
     {
       if (inImpliedParagraph)
         blockClose(HTML.Tag.IMPLIED);
 
       DefaultStyledDocument.ElementSpec element;
 
-      parseStack.push(t);
       AbstractDocument.AttributeContext ctx = getAttributeContext();
       AttributeSet copy = attr.copyAttributes();
       copy = ctx.addAttribute(copy, StyleConstants.NameAttribute, t);
       element = new DefaultStyledDocument.ElementSpec(copy,
                                DefaultStyledDocument.ElementSpec.StartTagType);
       parseBuffer.addElement(element);
     }
 
     /**
      * Instructs the parse buffer to close the block element associated with 
      * the given HTML.Tag
      * 
      * @param t the HTML.Tag that is closing its block
      */
     protected void blockClose(HTML.Tag t)
@@ -1534,45 +1527,36 @@
       if (inImpliedParagraph)
         {
           inImpliedParagraph = false;
           inParagraph = false;
           if (t != HTML.Tag.IMPLIED)
             blockClose(HTML.Tag.IMPLIED);
         }
 
       // If the previous tag is a start tag then we insert a synthetic
       // content tag.
       DefaultStyledDocument.ElementSpec prev;
       prev = (DefaultStyledDocument.ElementSpec)
 	      parseBuffer.get(parseBuffer.size() - 1);
       if (prev.getType() == DefaultStyledDocument.ElementSpec.StartTagType)
         {
-          AbstractDocument.AttributeContext ctx = getAttributeContext();
-          AttributeSet attributes = ctx.getEmptySet();
-          attributes = ctx.addAttribute(attributes, StyleConstants.NameAttribute,
-                                        HTML.Tag.CONTENT);
-          element = new DefaultStyledDocument.ElementSpec(attributes,
-			  DefaultStyledDocument.ElementSpec.ContentType,
-                                    new char[0], 0, 0);
-          parseBuffer.add(element);
+          addContent(new char[]{' '}, 0, 1);
         }
 
       element = new DefaultStyledDocument.ElementSpec(null,
 				DefaultStyledDocument.ElementSpec.EndTagType);
       parseBuffer.addElement(element);
-      if (parseStack.size() > 0)
-        parseStack.pop();
     }
     
     /**
      * Adds text to the appropriate context using the current character
      * attribute set.
      * 
      * @param data the text to add
      * @param offs the offset at which to add it
      * @param length the length of the text to add
      */
     protected void addContent(char[] data, int offs, int length)
     {
       addContent(data, offs, length, true);
     }
     
@@ -1722,34 +1706,30 @@
       {
         if (t != HTML.Tag.BODY)
           super.handleStartTag(t, a, pos);
       }
 
       /**
        * Ignore BODY.
        */
       public void handleEndTag(HTML.Tag t, int pos)
       {
         if (t != HTML.Tag.BODY)
           super.handleEndTag(t, pos);
       }
     };
       
-    // Set the parent HTML tag.
-    reader.parseStack.push(parent.getAttributes().getAttribute(
-      StyleConstants.NameAttribute));
-
     return reader;
   }   
   
   /**
    * Gets the child element that contains the attribute with the value or null.
    * Not thread-safe.
    * 
    * @param e - the element to begin search at
    * @param attribute - the desired attribute
    * @param value - the desired value
    * @return the element found with the attribute and value specified or null if
    *         it is not found.
    */
   public Element getElement(Element e, Object attribute, Object value)
   {
Index: gnu/javax/swing/text/html/parser/htmlValidator.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/javax/swing/text/html/parser/htmlValidator.java,v
retrieving revision 1.5
diff -u -1 -5 -r1.5 htmlValidator.java
--- gnu/javax/swing/text/html/parser/htmlValidator.java	16 Jul 2006 18:25:53 -0000	1.5
+++ gnu/javax/swing/text/html/parser/htmlValidator.java	15 Nov 2006 13:31:02 -0000
@@ -141,31 +141,31 @@
       {
         h = (hTag) stack.getLast();
         if (!h.forcibly_closed && !h.element.omitEnd())
           s_error("Unclosed <" + h.tag + ">, closing at the end of stream");
 
         handleSupposedEndTag(h.element);
 
         closeTag(h.tgElement);
       }
   }
 
   /**
    * Remove the given tag from the stack or (if found) from the list
    * of the forcibly closed tags.
    */
-  public void closeTag(TagElement tElement)
+  public boolean closeTag(TagElement tElement)
   {
     HTML.Tag tag = tElement.getHTMLTag();
     hTag x;
     hTag close;
 
     if (!stack.isEmpty())
       {
         ListIterator iter = stack.listIterator(stack.size());
 
         while (iter.hasPrevious())
           {
             x = (hTag) iter.previous();
             if (tag.equals(x.tag))
               {
                 if (x.forcibly_closed && !x.element.omitEnd())
@@ -179,35 +179,36 @@
                 closing: 
                 if (x.element.content != null)
                   {
                     iter = stack.listIterator(stack.size());
                     while (iter.hasPrevious())
                       {
                         close = (hTag) iter.previous();
                         if (close == x)
                           break closing;
                         handleSupposedEndTag(close.element);
                         iter.remove();
                       }
                   }
 
                 stack.remove(x);
-                return;
+                return true;
               }
           }
       }
     s_error("Closing unopened <" + tag + ">");
+    return false;
   }
 
   /**
    * Add the given HTML tag to the stack of the opened tags. Forcibly closes
    * all tags in the stack that does not allow this tag in they content (error
    * is reported).
    * @param element
    */
   public void openTag(TagElement tElement, htmlAttributeSet parameters)
   {
     // If this is a fictional call, the message from the parser
     // has recursively returned - ignore.
     if (tElement.fictional())
       return;
 
Index: gnu/javax/swing/text/html/parser/support/Parser.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/javax/swing/text/html/parser/support/Parser.java,v
retrieving revision 1.10
diff -u -1 -5 -r1.10 Parser.java
--- gnu/javax/swing/text/html/parser/support/Parser.java	15 Nov 2006 10:52:16 -0000	1.10
+++ gnu/javax/swing/text/html/parser/support/Parser.java	15 Nov 2006 13:31:02 -0000
@@ -1188,32 +1188,32 @@
       }
     catch (ChangedCharSetException ex)
       {
         error("Changed charset exception:", ex.getMessage());
       }
   }
 
   /**
    * A hooks for operations, preceeding call to handleEndTag().
    * The method is called when the HTML closing tag
    * is found. Calls handleTitle after closing the 'title' tag.
    * @param The tag
    */
   private void _handleEndTag(TagElement tag)
   {
-    validator.closeTag(tag);
-    _handleEndTag_remaining(tag);
+    if (validator.closeTag(tag))
+       _handleEndTag_remaining(tag);
   }
 
   /**
    * Actions that are also required if the closing action was
    * initiated by the tag validator.
    * Package-private to avoid an accessor method.
    */
   void _handleEndTag_remaining(TagElement tag)
   {
     HTML.Tag h = tag.getHTMLTag();
 
     handleEndTag(tag);
     endTag(tag.fictional());
 
     if (h.isPreformatted())
Index: gnu/javax/swing/text/html/parser/support/textPreProcessor.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/javax/swing/text/html/parser/support/textPreProcessor.java,v
retrieving revision 1.3
diff -u -1 -5 -r1.3 textPreProcessor.java
--- gnu/javax/swing/text/html/parser/support/textPreProcessor.java	3 Sep 2006 20:42:43 -0000	1.3
+++ gnu/javax/swing/text/html/parser/support/textPreProcessor.java	15 Nov 2006 13:31:02 -0000
@@ -53,46 +53,38 @@
    * consumed. The content of the passed buffer is destroyed.
    * 
    * @param a_text A text to pre-process.
    */
   public char[] preprocess(StringBuffer a_text)
   {
     if (a_text.length() == 0)
       return null;
 
     char[] text = toCharArray(a_text);
 
     int a = 0;
     int b = text.length - 1;
 
     // Remove leading/trailing whitespace, leaving at most one character
-    try
-      {
-        while (Constants.bWHITESPACE.get(text[a])
-               && Constants.bWHITESPACE.get(text[a + 1]))
-          a++;
+    int len = text.length;
+    while (a + 1 < len && Constants.bWHITESPACE.get(text[a])
+           && Constants.bWHITESPACE.get(text[a + 1]))
+      a++;
 
-        while (b > a && Constants.bWHITESPACE.get(text[b])
+    while (b > a && Constants.bWHITESPACE.get(text[b])
                && Constants.bWHITESPACE.get(text[b - 1]))
-          b--;
-      }
-    catch (ArrayIndexOutOfBoundsException sx)
-      {
-        // A text fragment, consisting from spaces and line breaks only,
-        // mutates into single space.
-        return new char[] { ' ' };
-      }
+      b--;
 
     a_text.setLength(0);
 
     boolean spacesWere = false;
     boolean spaceNow;
     char c;
 
     chars: for (int i = a; i <= b; i++)
       {
         c = text[i];
         spaceNow = Constants.bWHITESPACE.get(c);
         if (spacesWere && spaceNow)
           continue chars;
         if (spaceNow)
           a_text.append(' ');

[cp-patches] FYI: HTML parsing fix

Reply via email to