ReducedHTMLParser: incorrect assumption about STATE_EXPECTING_ETAGO
-------------------------------------------------------------------
Key: TOMAHAWK-1458
URL: https://issues.apache.org/jira/browse/TOMAHAWK-1458
Project: MyFaces Tomahawk
Issue Type: Bug
Components: ExtensionsFilter
Affects Versions: 1.1.9
Reporter: Lutz Ulruch
ReducedHTMLParser assumes that <script> elements cannot contain "</" before
</script>. This is not true.
Raw example:
<script>
function foo() { var str = "</tr>; }"
</script>
In this case, ReducedHTMLParser switches to STATE_READY when "</tr>" is
handled. But the <script> element is not closed here.
Below, I provide a patch for the parse() method.
Note that this patch still works incorrectly if the script contains the string
"</script>", like in
<script>
function foo() { var str = "</script>; }"
</script>
Patch (my changes indicated by comments // L. Ulrich ...)
void parse()
{
int state = STATE_READY;
int currentTagStart = -1;
String currentTagName = null;
_lineNumber = 1;
_offset = 0;
int lastOffset = _offset -1;
// L. Ulrich, 23.09.2009:
// New helper variable which holds the tag name
// in case of STATE_EXPECTING_ETAGO
String currentEtagoTagName = null;
while (_offset < _seq.length())
{
// Sanity check; each pass through this loop must increase the
offset.
// Failure to do this means a hang situation has occurred.
if (_offset <= lastOffset)
{
// throw new RuntimeException("Infinite loop detected in
ReducedHTMLParser");
log.error("Infinite loop detected in ReducedHTMLParser; parsing
skipped."+
" Surroundings: '" + getTagSurroundings() +"'.");
//return;
}
lastOffset = _offset;
if (state == STATE_READY)
{
// in this state, nothing but "<" has any significance
consumeExcept("<");
if (isFinished())
{
break;
}
if (consumeMatch("<!--"))
{
// Note that whitespace is *not* permitted in <!--
state = STATE_IN_COMMENT;
}
else if (consumeMatch("<!["))
{
// Start of a "marked section", eg "<![CDATA" or
// "<![INCLUDE" or "<![IGNORE". These always terminate
// with "]]>"
log.debug("Marked section found at line " +
getCurrentLineNumber()+". "+
"Surroundings: '" + getTagSurroundings() +"'.");
state = STATE_IN_MARKED_SECTION;
}
else if (consumeMatch("<!DOCTYPE"))
{
log.debug("DOCTYPE found at line " +
getCurrentLineNumber());
// we don't need to actually do anything here; the
// tag can't contain a bare "<", so the first "<"
// indicates the start of the next real tag.
//
// TODO: Handle case where the DOCTYPE includes an internal
DTD. In
// that case there *will* be embedded < chars in the
document. However
// that's very unlikely to be used in a JSF page, so this
is pretty low
// priority.
}
else if (consumeMatch("<?"))
{
// xml processing instruction or <!DOCTYPE> tag
// we don't need to actually do anything here; the
// tag can't contain a bare "<", so the first "<"
// indicates the start of the next real tag.
log.debug("PI found at line " + getCurrentLineNumber());
}
else if (consumeMatch("</"))
{
if (!processEndTag())
{
// message already logged
return;
}
// stay in state READY
state = STATE_READY;
}
else if (consumeMatch("<"))
{
// We can't tell the user that the tag has closed until
after we have
// processed any attributes and found the real end of the
tag. So save
// the current info until the end of this tag.
currentTagStart = _offset - 1;
currentTagName = consumeElementName();
if (currentTagName == null)
{
log.warn("Invalid HTML; bare lessthan sign found at
line "
+ getCurrentLineNumber() + ". "+
"Surroundings: '" + getTagSurroundings()
+"'.");
// remain in STATE_READY; this isn't really the start of
// an xml element.
}
else
{
state = STATE_IN_TAG;
}
}
else
{
// should never get here
throw new Error("Internal error at line " +
getCurrentLineNumber());
}
continue;
}
if (state == STATE_IN_COMMENT)
{
// TODO: handle "-- >", which is a valid way to close a
// comment according to the specs.
// in this state, nothing but "--" has any significance
consumeExcept("-");
if (isFinished())
{
break;
}
if (consumeMatch("-->"))
{
state = STATE_READY;
}
else
{
// false call; hyphen is not end of comment
consumeMatch("-");
}
continue;
}
if (state == STATE_IN_TAG)
{
consumeWhitespace();
if (consumeMatch("/>"))
{
// ok, end of element
state = STATE_READY;
closedTag(currentTagStart, _offset, currentTagName);
// and reset vars just in case...
currentTagStart = -1;
currentTagName = null;
}
else if (consumeMatch(">"))
{
if (currentTagName.equalsIgnoreCase("script")
|| currentTagName.equalsIgnoreCase("style"))
{
// We've just started a special tag which can contain
anything except
// the ETAGO marker ("</"). See
//
http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data
state = STATE_EXPECTING_ETAGO;
// L. Ulrich, 23.09.2009:
// set currentEtagoTagName
currentEtagoTagName = currentTagName;
}
else
{
state = STATE_READY;
}
// end of open tag, but not end of element
openedTag(currentTagStart, _offset, currentTagName);
// and reset vars just in case...
currentTagStart = -1;
currentTagName = null;
}
else
{
// xml attribute
String attrName = consumeAttrName();
if (attrName == null)
{
// Oops, we found something quite unexpected in this
tag.
// The best we can do is probably to drop back to
looking
// for "/>", though that does risk us misinterpreting
the
// contents of an attribute's associated string value.
log.warn("Invalid tag found: unexpected input while
looking for attr name or '/>'"
+ " at line " + getCurrentLineNumber()+". "+
"Surroundings: '" + getTagSurroundings()
+"'.");
state = STATE_EXPECTING_ETAGO;
// and consume one character
++_offset;
}
else
{
consumeWhitespace();
// html can have "stand-alone" attributes with no
following equals sign
if (consumeMatch("="))
{
consumeAttrValue();
}
}
}
continue;
}
if (state == STATE_IN_MARKED_SECTION)
{
// in this state, nothing but "]]>" has any significance
consumeExcept("]");
if (isFinished())
{
break;
}
if (consumeMatch("]]>"))
{
state = STATE_READY;
}
else
{
// false call; ] is not end of cdata section
consumeMatch("]");
}
continue;
}
if (state == STATE_EXPECTING_ETAGO)
{
// The term "ETAGO" is the official spec term for "</".
consumeExcept("<");
if (isFinished())
{
log.debug("Malformed input page; input terminated while tag
not closed.");
break;
}
if (consumeMatch("</"))
{
// L. Ulrich, 23.09.2009:
// Workaround to skip other tags used within scripts:
// Test if the closed tag refers to currentEtagoTagName.
// Example:
// <script> function foo() { var str = "</body>"; ... }
</script>
// => do not tread </body> as the script closing tag
//
// Note that this will still not work as expected
// in case of recursive tags.
// Example:
// <script> function foo() { var str = "</script>"; ... }
</script>
CharSequence str = this._seq.subSequence(this._offset,
this._offset +
currentEtagoTagName.length());
if (str.toString().equals(currentEtagoTagName))
{
if (!processEndTag())
{
return;
}
state = STATE_READY;
currentEtagoTagName = null;
}
}
else
{
// false call; < does not start an ETAGO
consumeMatch("<");
}
continue;
}
}
}
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.