This fixes some nasty problems in the HTML parser with respect to whitespace. I
found some comments in the specs which state that all whitespace
immediately following an opening tag and immediately preceding a closing
tag must be discarded, and indeed, that solves all the problems I had
(at least those with whitespace).
2006-11-15 Roman Kennke <[EMAIL PROTECTED]>
* gnu/javax/swing/text/html/parser/support/Parser.java
(_handleText): Consume whitespace directly before a closing tag.
(restOfTag): Consume whitespace directly after an opening tag.
* gnu/javax/swing/text/html/parser/support/textPreProcessor.java
(preprocess): Don't perform array boundary checking by
catching AIOOBE; instead check the boundary in the loop condition.
* gnu/javax/swing/text/html/parser/support/low/Constants.java
(TAG_CLOSE): New constant. Describes the token pattern for
a closing tag.
/Roman
Index: gnu/javax/swing/text/html/parser/support/Parser.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/javax/swing/text/html/parser/support/Parser.java,v
retrieving revision 1.9
diff -u -1 -5 -r1.9 Parser.java
--- gnu/javax/swing/text/html/parser/support/Parser.java 7 Nov 2006 23:44:11 -0000 1.9
+++ gnu/javax/swing/text/html/parser/support/Parser.java 15 Nov 2006 10:51:43 -0000
@@ -647,31 +647,34 @@
* In non - preformatted mode, all line breaks immediately following the
* start tag and immediately before an end tag is discarded,
* \r, \n and \t are replaced by spaces, multiple space are replaced
* by the single one and the result is moved into array,
* passing it to handleText().
*/
protected void _handleText()
{
char[] text;
if (preformatted > 0)
text = textProcessor.preprocessPreformatted(buffer);
else
text = textProcessor.preprocess(buffer);
- if (text != null && text.length > 0)
+ if (text != null && text.length > 0
+ // According to the specs we need to discard whitespace immediately
+ // before a closing tag.
+ && (text.length > 1 || (text[0] == ' ' && ! TAG_CLOSE.matches(this))))
{
TagElement pcdata = new TagElement(dtd.getElement("#pcdata"));
if ((text.length > 1 && text[0] != ' ')
|| validator.tagIsValidForContext(pcdata) == Boolean.TRUE)
{
attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
_handleEmptyTag(pcdata);
handleText(text);
if (titleOpen)
title.append(text);
}
}
}
@@ -1448,31 +1451,36 @@
forciblyCloseTheTag();
}
}
if (closing)
{
endTag(false);
_handleEndTag(makeTagElement(name.getImage(), false));
}
else
{
TagElement te = makeTagElement(name.getImage(), false);
if (te.getElement().type == DTDConstants.EMPTY)
_handleEmptyTag(te);
else
- _handleStartTag(te);
+ {
+ // According to the specs we need to consume whitespace following
+ // immediately after an opening tag.
+ optional(WS);
+ _handleStartTag(te);
+ }
}
}
/**
* This should fire additional actions in response to the
* ChangedCharSetException. The current implementation
* does nothing.
* @param tag
*/
private void startingTag(TagElement tag)
{
try
{
startTag(tag);
}
Index: gnu/javax/swing/text/html/parser/support/textPreProcessor.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/javax/swing/text/html/parser/support/textPreProcessor.java,v
retrieving revision 1.3
diff -u -1 -5 -r1.3 textPreProcessor.java
--- gnu/javax/swing/text/html/parser/support/textPreProcessor.java 3 Sep 2006 20:42:43 -0000 1.3
+++ gnu/javax/swing/text/html/parser/support/textPreProcessor.java 15 Nov 2006 10:51:43 -0000
@@ -53,46 +53,38 @@
* consumed. The content of the passed buffer is destroyed.
*
* @param a_text A text to pre-process.
*/
public char[] preprocess(StringBuffer a_text)
{
if (a_text.length() == 0)
return null;
char[] text = toCharArray(a_text);
int a = 0;
int b = text.length - 1;
// Remove leading/trailing whitespace, leaving at most one character
- try
- {
- while (Constants.bWHITESPACE.get(text[a])
- && Constants.bWHITESPACE.get(text[a + 1]))
- a++;
+ int len = text.length;
+ while (a + 1 < len && Constants.bWHITESPACE.get(text[a])
+ && Constants.bWHITESPACE.get(text[a + 1]))
+ a++;
- while (b > a && Constants.bWHITESPACE.get(text[b])
+ while (b > a && Constants.bWHITESPACE.get(text[b])
&& Constants.bWHITESPACE.get(text[b - 1]))
- b--;
- }
- catch (ArrayIndexOutOfBoundsException sx)
- {
- // A text fragment, consisting from spaces and line breaks only,
- // mutates into single space.
- return new char[] { ' ' };
- }
+ b--;
a_text.setLength(0);
boolean spacesWere = false;
boolean spaceNow;
char c;
chars: for (int i = a; i <= b; i++)
{
c = text[i];
spaceNow = Constants.bWHITESPACE.get(c);
if (spacesWere && spaceNow)
continue chars;
if (spaceNow)
a_text.append(' ');
Index: gnu/javax/swing/text/html/parser/support/low/Constants.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/javax/swing/text/html/parser/support/low/Constants.java,v
retrieving revision 1.2
diff -u -1 -5 -r1.2 Constants.java
--- gnu/javax/swing/text/html/parser/support/low/Constants.java 2 Jul 2005 20:32:15 -0000 1.2
+++ gnu/javax/swing/text/html/parser/support/low/Constants.java 15 Nov 2006 10:51:43 -0000
@@ -197,30 +197,41 @@
new node(END)
}
);
/**
* Ordinary HTML tag heading pattern.
*/
public static final pattern TAG =
new pattern(new node[]
{
new node(BEGIN), new node(WS, true), new node(SLASH, true),
new node(WS, true), new node(NUMTOKEN)
}
);
+ /**
+ * Ordinary HTML tag closing pattern.
+ */
+ public static final pattern TAG_CLOSE =
+ new pattern(new node[]
+ {
+ new node(BEGIN), new node(WS, true), new node(SLASH),
+ new node(WS, true), new node(NUMTOKEN)
+ }
+ );
+
/* Special tokens */
/**
* All other tokens.
*/
public static final int OTHER = 1999;
/**
* The UNICODE "end of text" control code
*/
static final char ETX = 3;
/**
* End of file.
*/