[
https://issues.apache.org/jira/browse/TOMAHAWK-1458?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Lutz Ulruch updated TOMAHAWK-1458:
----------------------------------
Status: Open (was: Patch Available)
> ReducedHTMLParser: incorrect assumption about STATE_EXPECTING_ETAGO
> -------------------------------------------------------------------
>
> Key: TOMAHAWK-1458
> URL: https://issues.apache.org/jira/browse/TOMAHAWK-1458
> Project: MyFaces Tomahawk
> Issue Type: Bug
> Components: ExtensionsFilter
> Affects Versions: 1.1.9
> Reporter: Lutz Ulruch
>
> ReducedHTMLParser assumes that <script> elements cannot contain "</" before
> </script>. This is not true.
> Raw example:
> <script>
> function foo() { var str = "</tr>"; }
> </script>
> In this case, ReducedHTMLParser switches to STATE_READY when "</tr>" is
> handled. But the <script> element is not closed here.
> Below, I provide a patch for the parse() method.
> Note that this patch still works incorrectly if the script contains the
> string "</script>", like in
> <script>
> function foo() { var str = "</script>"; }
> </script>
> Patch (my changes indicated by comments // L. Ulrich ...)
> void parse()
> {
> int state = STATE_READY;
> int currentTagStart = -1;
> String currentTagName = null;
> _lineNumber = 1;
> _offset = 0;
> int lastOffset = _offset -1;
>
> // L. Ulrich, 23.09.2009:
> // New helper variable which holds the tag name
> // in case of STATE_EXPECTING_ETAGO
> String currentEtagoTagName = null;
>
> while (_offset < _seq.length())
> {
> // Sanity check; each pass through this loop must increase the
> offset.
> // Failure to do this means a hang situation has occurred.
> if (_offset <= lastOffset)
> {
> // throw new RuntimeException("Infinite loop detected in
> ReducedHTMLParser");
> log.error("Infinite loop detected in ReducedHTMLParser;
> parsing skipped."+
> " Surroundings: '" + getTagSurroundings() +"'.");
> //return;
> }
> lastOffset = _offset;
> if (state == STATE_READY)
> {
> // in this state, nothing but "<" has any significance
> consumeExcept("<");
> if (isFinished())
> {
> break;
> }
> if (consumeMatch("<!--"))
> {
> // Note that whitespace is *not* permitted in <!--
> state = STATE_IN_COMMENT;
> }
> else if (consumeMatch("<!["))
> {
> // Start of a "marked section", eg "<![CDATA" or
> // "<![INCLUDE" or "<![IGNORE". These always terminate
> // with "]]>"
> log.debug("Marked section found at line " +
> getCurrentLineNumber()+". "+
> "Surroundings: '" + getTagSurroundings() +"'.");
> state = STATE_IN_MARKED_SECTION;
> }
> else if (consumeMatch("<!DOCTYPE"))
> {
> log.debug("DOCTYPE found at line " +
> getCurrentLineNumber());
> // we don't need to actually do anything here; the
> // tag can't contain a bare "<", so the first "<"
> // indicates the start of the next real tag.
> //
> // TODO: Handle case where the DOCTYPE includes an
> internal DTD. In
> // that case there *will* be embedded < chars in the
> document. However
> // that's very unlikely to be used in a JSF page, so this
> is pretty low
> // priority.
> }
> else if (consumeMatch("<?"))
> {
> // xml processing instruction or <!DOCTYPE> tag
> // we don't need to actually do anything here; the
> // tag can't contain a bare "<", so the first "<"
> // indicates the start of the next real tag.
> log.debug("PI found at line " + getCurrentLineNumber());
> }
> else if (consumeMatch("</"))
> {
> if (!processEndTag())
> {
> // message already logged
> return;
> }
> // stay in state READY
> state = STATE_READY;
> }
> else if (consumeMatch("<"))
> {
> // We can't tell the user that the tag has closed until
> after we have
> // processed any attributes and found the real end of the
> tag. So save
> // the current info until the end of this tag.
> currentTagStart = _offset - 1;
> currentTagName = consumeElementName();
> if (currentTagName == null)
> {
> log.warn("Invalid HTML; bare lessthan sign found at
> line "
> + getCurrentLineNumber() + ". "+
> "Surroundings: '" + getTagSurroundings()
> +"'.");
> // remain in STATE_READY; this isn't really the start
> of
> // an xml element.
> }
> else
> {
> state = STATE_IN_TAG;
> }
> }
> else
> {
> // should never get here
> throw new Error("Internal error at line " +
> getCurrentLineNumber());
> }
> continue;
> }
> if (state == STATE_IN_COMMENT)
> {
> // TODO: handle "-- >", which is a valid way to close a
> // comment according to the specs.
> // in this state, nothing but "--" has any significance
> consumeExcept("-");
> if (isFinished())
> {
> break;
> }
> if (consumeMatch("-->"))
> {
> state = STATE_READY;
> }
> else
> {
> // false call; hyphen is not end of comment
> consumeMatch("-");
> }
> continue;
> }
> if (state == STATE_IN_TAG)
> {
> consumeWhitespace();
> if (consumeMatch("/>"))
> {
> // ok, end of element
> state = STATE_READY;
> closedTag(currentTagStart, _offset, currentTagName);
> // and reset vars just in case...
> currentTagStart = -1;
> currentTagName = null;
> }
> else if (consumeMatch(">"))
> {
> if (currentTagName.equalsIgnoreCase("script")
> || currentTagName.equalsIgnoreCase("style"))
> {
> // We've just started a special tag which can contain
> anything except
> // the ETAGO marker ("</"). See
> //
> http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data
> state = STATE_EXPECTING_ETAGO;
>
> // L. Ulrich, 23.09.2009:
> // set currentEtagoTagName
> currentEtagoTagName = currentTagName;
> }
> else
> {
> state = STATE_READY;
> }
> // end of open tag, but not end of element
> openedTag(currentTagStart, _offset, currentTagName);
> // and reset vars just in case...
> currentTagStart = -1;
> currentTagName = null;
> }
> else
> {
> // xml attribute
> String attrName = consumeAttrName();
> if (attrName == null)
> {
> // Oops, we found something quite unexpected in this
> tag.
> // The best we can do is probably to drop back to
> looking
> // for "/>", though that does risk us misinterpreting
> the
> // contents of an attribute's associated string value.
> log.warn("Invalid tag found: unexpected input while
> looking for attr name or '/>'"
> + " at line " + getCurrentLineNumber()+". "+
> "Surroundings: '" + getTagSurroundings()
> +"'.");
> state = STATE_EXPECTING_ETAGO;
> // and consume one character
> ++_offset;
> }
> else
> {
> consumeWhitespace();
> // html can have "stand-alone" attributes with no
> following equals sign
> if (consumeMatch("="))
> {
> consumeAttrValue();
> }
> }
> }
> continue;
> }
> if (state == STATE_IN_MARKED_SECTION)
> {
> // in this state, nothing but "]]>" has any significance
> consumeExcept("]");
> if (isFinished())
> {
> break;
> }
> if (consumeMatch("]]>"))
> {
> state = STATE_READY;
> }
> else
> {
> // false call; ] is not end of cdata section
> consumeMatch("]");
> }
> continue;
> }
> if (state == STATE_EXPECTING_ETAGO)
> {
> // The term "ETAGO" is the official spec term for "</".
> consumeExcept("<");
> if (isFinished())
> {
> log.debug("Malformed input page; input terminated while
> tag not closed.");
> break;
> }
> if (consumeMatch("</"))
> {
> // L. Ulrich, 23.09.2009:
> // Workaround to skip other tags used within scripts:
> // Test if the closed tag refers to currentEtagoTagName.
> // Example:
> // <script> function foo() { var str = "</body>"; ... }
> </script>
> // => do not treat </body> as the script closing tag
> //
> // Note that this will still not work as expected
> // in case of recursive tags.
> // Example:
> // <script> function foo() { var str = "</script>"; ... }
> </script>
> CharSequence str = this._seq.subSequence(this._offset,
> this._offset +
> currentEtagoTagName.length());
> if (str.toString().equals(currentEtagoTagName))
> {
> if (!processEndTag())
> {
> return;
> }
> state = STATE_READY;
> currentEtagoTagName = null;
> }
> }
> else
> {
> // false call; < does not start an ETAGO
> consumeMatch("<");
> }
> continue;
> }
> }
> }
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.