ReducedHTMLParser: incorrect assumption about STATE_EXPECTING_ETAGO
-------------------------------------------------------------------

                 Key: TOMAHAWK-1458
                 URL: https://issues.apache.org/jira/browse/TOMAHAWK-1458
             Project: MyFaces Tomahawk
          Issue Type: Bug
          Components: ExtensionsFilter
    Affects Versions: 1.1.9
            Reporter: Lutz Ulruch


ReducedHTMLParser assumes that <script> elements cannot contain "</" before 
</script>. This is not true.
Raw example:

<script>
  function foo() { var str = "</tr>; }"
</script>

In this case, ReducedHTMLParser switches to STATE_READY when "</tr>" is 
handled. But the <script> element is not closed here.
Below, I provide a patch for the parse() method. 
Note that this patch still works incorrectly if the script contains the string 
"</script>", like in

<script>
  function foo() { var str = "</script>; }"
</script>

Patch (my changes indicated by comments // L. Ulrich ...) 

    void parse()
    {
        int state = STATE_READY;

        int currentTagStart = -1;
        String currentTagName = null;

        _lineNumber = 1;
        _offset = 0;
        int lastOffset = _offset -1;
                
        // L. Ulrich, 23.09.2009:
        // New helper variable which holds the tag name 
        // in case of STATE_EXPECTING_ETAGO
        String currentEtagoTagName = null;
        
        while (_offset < _seq.length())
        {
            // Sanity check; each pass through this loop must increase the 
offset.
            // Failure to do this means a hang situation has occurred.
            if (_offset <= lastOffset)
            {
                // throw new RuntimeException("Infinite loop detected in 
ReducedHTMLParser");
                log.error("Infinite loop detected in ReducedHTMLParser; parsing 
skipped."+
                          " Surroundings: '" + getTagSurroundings() +"'.");
                //return;
            }
            lastOffset = _offset;

            if (state == STATE_READY)
            {
                // in this state, nothing but "<" has any significance
                consumeExcept("<");
                if (isFinished())
                {
                    break;
                }

                if (consumeMatch("<!--"))
                {
                    // Note that whitespace is *not* permitted in <!--
                    state = STATE_IN_COMMENT;
                }
                else if (consumeMatch("<!["))
                {
                    // Start of a "marked section", eg "<![CDATA" or
                    // "<![INCLUDE" or "<![IGNORE". These always terminate
                    // with "]]>"
                    log.debug("Marked section found at line " + 
getCurrentLineNumber()+". "+
                              "Surroundings: '" + getTagSurroundings() +"'.");
                    state = STATE_IN_MARKED_SECTION;
                }
                else if (consumeMatch("<!DOCTYPE"))
                {
                    log.debug("DOCTYPE found at line " + 
getCurrentLineNumber());
                    // we don't need to actually do anything here; the
                    // tag can't contain a bare "<", so the first "<"
                    // indicates the start of the next real tag.
                    //
                    // TODO: Handle case where the DOCTYPE includes an internal 
DTD. In
                    // that case there *will* be embedded < chars in the 
document. However
                    // that's very unlikely to be used in a JSF page, so this 
is pretty low
                    // priority.
                }
                else if (consumeMatch("<?"))
                {
                    // xml processing instruction or <!DOCTYPE> tag
                    // we don't need to actually do anything here; the
                    // tag can't contain a bare "<", so the first "<"
                    // indicates the start of the next real tag.
                    log.debug("PI found at line " + getCurrentLineNumber());
                }
                else if (consumeMatch("</"))
                {
                    if (!processEndTag())
                    {
                        // message already logged
                        return;
                    }

                    // stay in state READY
                    state = STATE_READY;
                }
                else if (consumeMatch("<"))
                {
                    // We can't tell the user that the tag has closed until 
after we have
                    // processed any attributes and found the real end of the 
tag. So save
                    // the current info until the end of this tag.
                    currentTagStart = _offset - 1;
                    currentTagName = consumeElementName();
                    if (currentTagName == null)
                    {
                        log.warn("Invalid HTML; bare lessthan sign found at 
line "
                                 + getCurrentLineNumber() + ". "+
                                 "Surroundings: '" + getTagSurroundings() 
+"'.");
                        // remain in STATE_READY; this isn't really the start of
                        // an xml element.
                    }
                    else
                    {
                        state = STATE_IN_TAG;
                    }
                }
                else
                {
                    // should never get here
                    throw new Error("Internal error at line " + 
getCurrentLineNumber());
                }

                continue;
            }

            if (state == STATE_IN_COMMENT)
            {
                // TODO: handle "--  >", which is a valid way to close a
                // comment according to the specs.

                // in this state, nothing but "--" has any significance
                consumeExcept("-");
                if (isFinished())
                {
                    break;
                }

                if (consumeMatch("-->"))
                {
                    state = STATE_READY;
                }
                else
                {
                    // false call; hyphen is not end of comment
                    consumeMatch("-");
                }

                continue;
            }

            if (state == STATE_IN_TAG)
            {
                consumeWhitespace();

                if (consumeMatch("/>"))
                {
                    // ok, end of element
                    state = STATE_READY;
                    closedTag(currentTagStart, _offset, currentTagName);

                    // and reset vars just in case...
                    currentTagStart = -1;
                    currentTagName = null;
                }
                else if (consumeMatch(">"))
                {
                    if (currentTagName.equalsIgnoreCase("script")
                        || currentTagName.equalsIgnoreCase("style"))
                    {
                        // We've just started a special tag which can contain 
anything except
                        // the ETAGO marker ("</"). See
                        // 
http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data
                        state = STATE_EXPECTING_ETAGO;
                        
                        // L. Ulrich, 23.09.2009:
                        // set currentEtagoTagName 
                        currentEtagoTagName = currentTagName;
                    }
                    else
                    {
                        state = STATE_READY;
                    }

                    // end of open tag, but not end of element
                    openedTag(currentTagStart, _offset, currentTagName);

                    // and reset vars just in case...
                    currentTagStart = -1;
                    currentTagName = null;
                }
                else
                {
                    // xml attribute
                    String attrName = consumeAttrName();
                    if (attrName == null)
                    {
                        // Oops, we found something quite unexpected in this 
tag.
                        // The best we can do is probably to drop back to 
looking
                        // for "/>", though that does risk us misinterpreting 
the
                        // contents of an attribute's associated string value.
                        log.warn("Invalid tag found: unexpected input while 
looking for attr name or '/>'"
                                 + " at line " + getCurrentLineNumber()+". "+
                                 "Surroundings: '" + getTagSurroundings() 
+"'.");
                        state = STATE_EXPECTING_ETAGO;
                        // and consume one character
                        ++_offset;
                    }
                    else
                    {
                        consumeWhitespace();

                        // html can have "stand-alone" attributes with no 
following equals sign
                        if (consumeMatch("="))
                        {
                            consumeAttrValue();
                        }
                    }
                }

                continue;
            }

            if (state == STATE_IN_MARKED_SECTION)
            {
                // in this state, nothing but "]]>" has any significance
                consumeExcept("]");
                if (isFinished())
                {
                    break;
                }

                if (consumeMatch("]]>"))
                {
                    state = STATE_READY;
                }
                else
                {
                    // false call; ] is not end of cdata section
                    consumeMatch("]");
                }

                continue;
            }

            if (state == STATE_EXPECTING_ETAGO)
            {
                // The term "ETAGO" is the official spec term for "</".
                consumeExcept("<");
                if (isFinished())
                {
                    log.debug("Malformed input page; input terminated while tag 
not closed.");
                    break;
                }

                if (consumeMatch("</"))
                {
                    // L. Ulrich, 23.09.2009:
                    // Workaround to skip other tags used within scripts:
                    // Test if the closed tag refers to currentEtagoTagName.
                    // Example:
                    // <script> function foo() { var str = "</body>"; ... } 
</script> 
                    // => do not tread </body> as the script closing tag
                    // 
                    // Note that this will still not work as expected
                    // in case of recursive tags.
                    // Example:
                    // <script> function foo() { var str = "</script>"; ... } 
</script>
                    CharSequence str = this._seq.subSequence(this._offset, 
                                                             this._offset + 
currentEtagoTagName.length());
                    if (str.toString().equals(currentEtagoTagName))
                    {
                        if (!processEndTag())
                        {
                            return;
                        }
                        state = STATE_READY;
                        currentEtagoTagName = null;
                    }
                }
                else
                {
                    // false call; < does not start an ETAGO
                    consumeMatch("<");
                }

                continue;
            }
        }
    }



-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

Reply via email to