[cp-patches] FYI: HTML parser fix

Roman Kennke Wed, 15 Nov 2006 02:55:54 -0800

This fixes some nasty problems in the HTML parser with respect to
whitespace. I found some comments in the specs which say that all
whitespace immediately following an opening tag and immediately
preceding a closing tag must be discarded, and indeed, that solves all
the problems I had (at least those with whitespace).


2006-11-15  Roman Kennke  <[EMAIL PROTECTED]>

        * gnu/javax/swing/text/html/parser/support/Parser.java
        (_handleText): Consume whitespace directly before a closing tag.
        (restOfTag): Consume whitespace directly after opening.
        * gnu/javax/swing/text/html/parser/support/textPreProcessor.java
        (preprocess): Don't perform array boundary checking by
        catching AIOOBE; instead check the boundary in the loop condition.
        * gnu/javax/swing/text/html/parser/support/low/Constants.java
        (TAG_CLOSE): New constant. Describes the token pattern for
        a closing tag.

/Roman

Index: gnu/javax/swing/text/html/parser/support/Parser.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/javax/swing/text/html/parser/support/Parser.java,v
retrieving revision 1.9
diff -u -1 -5 -r1.9 Parser.java
--- gnu/javax/swing/text/html/parser/support/Parser.java	7 Nov 2006 23:44:11 -0000	1.9
+++ gnu/javax/swing/text/html/parser/support/Parser.java	15 Nov 2006 10:51:43 -0000
@@ -647,31 +647,34 @@
    * In non - preformatted mode, all line breaks immediately following the
    * start tag and immediately before an end tag is discarded,
    * \r, \n and \t are replaced by spaces, multiple space are replaced
    * by the single one and the result is  moved into array,
    * passing it  to handleText().
    */
   protected void _handleText()
   {
     char[] text;
 
     if (preformatted > 0)
       text = textProcessor.preprocessPreformatted(buffer);
     else
       text = textProcessor.preprocess(buffer);
 
-    if (text != null && text.length > 0)
+    if (text != null && text.length > 0
+        // According to the specs we need to discard whitespace immediately
+        // before a closing tag.
+        && (text.length > 1 || (text[0] == ' ' && ! TAG_CLOSE.matches(this))))
       {
         TagElement pcdata = new TagElement(dtd.getElement("#pcdata"));
         if ((text.length > 1 && text[0] != ' ')
             || validator.tagIsValidForContext(pcdata) == Boolean.TRUE)
           {
             attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
             _handleEmptyTag(pcdata);
 
             handleText(text);
             if (titleOpen)
               title.append(text);
           }
       }
   }
 
@@ -1448,31 +1451,36 @@
             forciblyCloseTheTag();
           }
       }
 
     if (closing)
       {
         endTag(false);
         _handleEndTag(makeTagElement(name.getImage(), false));
       }
     else
       {
         TagElement te = makeTagElement(name.getImage(), false);
         if (te.getElement().type == DTDConstants.EMPTY)
           _handleEmptyTag(te);
         else
-          _handleStartTag(te);
+          {
+            // According to the specs we need to consume whitespace following
+            // immediately after a opening tag.
+            optional(WS);
+            _handleStartTag(te);
+          }
       }
   }
 
   /**
    * This should fire additional actions in response to the
    * ChangedCharSetException.  The current implementation
    * does nothing.
    * @param tag
    */
   private void startingTag(TagElement tag)
   {
     try
       {
         startTag(tag);
       }
Index: gnu/javax/swing/text/html/parser/support/textPreProcessor.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/javax/swing/text/html/parser/support/textPreProcessor.java,v
retrieving revision 1.3
diff -u -1 -5 -r1.3 textPreProcessor.java
--- gnu/javax/swing/text/html/parser/support/textPreProcessor.java	3 Sep 2006 20:42:43 -0000	1.3
+++ gnu/javax/swing/text/html/parser/support/textPreProcessor.java	15 Nov 2006 10:51:43 -0000
@@ -53,46 +53,38 @@
    * consumed. The content of the passed buffer is destroyed.
    * 
    * @param a_text A text to pre-process.
    */
   public char[] preprocess(StringBuffer a_text)
   {
     if (a_text.length() == 0)
       return null;
 
     char[] text = toCharArray(a_text);
 
     int a = 0;
     int b = text.length - 1;
 
     // Remove leading/trailing whitespace, leaving at most one character
-    try
-      {
-        while (Constants.bWHITESPACE.get(text[a])
-               && Constants.bWHITESPACE.get(text[a + 1]))
-          a++;
+    int len = text.length;
+    while (a + 1 < len && Constants.bWHITESPACE.get(text[a])
+           && Constants.bWHITESPACE.get(text[a + 1]))
+      a++;
 
-        while (b > a && Constants.bWHITESPACE.get(text[b])
+    while (b > a && Constants.bWHITESPACE.get(text[b])
                && Constants.bWHITESPACE.get(text[b - 1]))
-          b--;
-      }
-    catch (ArrayIndexOutOfBoundsException sx)
-      {
-        // A text fragment, consisting from spaces and line breaks only,
-        // mutates into single space.
-        return new char[] { ' ' };
-      }
+      b--;
 
     a_text.setLength(0);
 
     boolean spacesWere = false;
     boolean spaceNow;
     char c;
 
     chars: for (int i = a; i <= b; i++)
       {
         c = text[i];
         spaceNow = Constants.bWHITESPACE.get(c);
         if (spacesWere && spaceNow)
           continue chars;
         if (spaceNow)
           a_text.append(' ');
Index: gnu/javax/swing/text/html/parser/support/low/Constants.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/javax/swing/text/html/parser/support/low/Constants.java,v
retrieving revision 1.2
diff -u -1 -5 -r1.2 Constants.java
--- gnu/javax/swing/text/html/parser/support/low/Constants.java	2 Jul 2005 20:32:15 -0000	1.2
+++ gnu/javax/swing/text/html/parser/support/low/Constants.java	15 Nov 2006 10:51:43 -0000
@@ -197,30 +197,41 @@
                   new node(END)
                 }
                );
 
   /**
    * Ordinary HTML tag heading pattern.
    */
   public static final pattern TAG =
     new pattern(new node[]
                 {
                   new node(BEGIN), new node(WS, true), new node(SLASH, true),
                   new node(WS, true), new node(NUMTOKEN)
                 }
                );
 
+  /**
+   * Ordinary HTML tag closing pattern.
+   */
+  public static final pattern TAG_CLOSE =
+    new pattern(new node[]
+                {
+                  new node(BEGIN), new node(WS, true), new node(SLASH),
+                  new node(WS, true), new node(NUMTOKEN)
+                }
+               );
+
   /* Special tokens */
 
   /**
    * All other tokens.
    */
   public static final int OTHER = 1999;
 
   /**
    * The UNICODE "end of text" control code
    */
   static final char ETX = 3;
 
   /**
    * End of file.
    */

[cp-patches] FYI: HTML parser fix

Reply via email to