Re: ReducedHTMLParser issues

Martin Marinschek Tue, 01 Nov 2005 08:14:47 -0800

Ok,

even though the patch didn't work - I applied it line by line for the
parser as I needed this thing working right now.


Can you redo the test patch in any case?

regards,

Martin

On 11/1/05, Martin Marinschek <[EMAIL PROTECTED]> wrote:
> Simon,
>
> I don't seem to be able to apply your patch again - an 'unknown line
> type was found in line 12'.
>
> Can you do it again and attach it to our old JIRA issue? I have
> reopened it for this purpose.
>
> regards,
>
> Martin
>
> On 11/1/05, Simon Kitching <[EMAIL PROTECTED]> wrote:
> > Martin Marinschek wrote:
> > > Don't stress yourself - it's just the nightly build, so not too big of a
> > > problem.
> >
> > Thanks, but it's hopefully done anyway.
> >
> > changes:
> >   * Handle DOCTYPE and Processing Instruction commands in input HTML
> >   * Track line# of input for error messages
> >   * Remove some debugging printlns
> >
> > I can also provide a patch soon to format the code to the MyFaces
> > convention rather than the Sun convention if you wish. Sorry, my Eclipse
> > is set up to format stuff that way automatically and I forgot to
> > reformat before posting.
> >
> > Regards,
> >
> > Simon
> >
> >
> > Index: ReducedHTMLParser.java
> > ===================================================================
> > --- ReducedHTMLParser.java      (revision 329922)
> > +++ ReducedHTMLParser.java      (working copy)
> > @@ -49,6 +49,7 @@
> >      private static final int STATE_IN_TAG = 2;
> >
> >      private int offset;
> > +    private int lineNumber;
> >      private CharSequence seq;
> >      private CallbackListener listener;
> >
> > @@ -75,15 +76,32 @@
> >          return offset >= seq.length();
> >      }
> >
> > +    int getCurrentLineNumber() {
> > +        return lineNumber;
> > +    }
> > +
> >      /**
> >       * Advance the current parse position over any whitespace characters.
> >       */
> >      void consumeWhitespace() {
> > +        boolean crSeen = false;
> > +
> >          while (offset < seq.length()) {
> >              char c = seq.charAt(offset);
> >              if (!Character.isWhitespace(c)) {
> >                  break;
> >              }
> > +
> > +            // Track line number for error messages.
> > +            if (c == '\r') {
> > +                ++lineNumber;
> > +                crSeen = true;
> > +            } else if ((c == '\n') && !crSeen) {
> > +                ++lineNumber;
> > +            } else {
> > +                crSeen = false;
> > +            }
> > +
> >              ++offset;
> >          }
> >      }
> > @@ -193,6 +211,10 @@
> >          // TODO: should we consider a string to be terminated by a newline?
> >          // that would help with runaway strings but I think that multiline
> >          // strings *are* allowed...
> > +        //
> > +        // TODO: detect newlines within strings and increment lineNumber.
> > +        // This isn't so important, though; they aren't common and being a
> > +        // few lines out in an error message isn't serious either.
> >          StringBuffer stringBuf = new StringBuffer();
> >          boolean escaping = false;
> >          while (!isFinished()) {
> > @@ -248,6 +270,8 @@
> >       * @param s is a set of characters that should not be discarded.
> >       */
> >      void consumeExcept(String s) {
> > +        boolean crSeen = false;
> > +
> >          while (offset < seq.length()) {
> >              char c = seq.charAt(offset);
> >              if (s.indexOf(c) >= 0) {
> > @@ -255,6 +279,16 @@
> >                  return;
> >              }
> >
> > +            // Track line number for error messages.
> > +            if (c == '\r') {
> > +                ++lineNumber;
> > +                crSeen = true;
> > +            } else if ((c == '\n') && !crSeen) {
> > +                ++lineNumber;
> > +            } else {
> > +                crSeen = false;
> > +            }
> > +
> >              ++offset;
> >          }
> >      }
> > @@ -269,6 +303,7 @@
> >          int currentTagStart = -1;
> >          String currentTagName = null;
> >
> > +        lineNumber = 1;
> >          offset = 0;
> >          while (offset < seq.length())
> >          {
> > @@ -282,6 +317,10 @@
> >                  if (consumeMatch("<!--")) {
> >                      // VERIFY: can "< ! --" start a comment?
> >                      state = STATE_IN_COMMENT;
> > +                } else if (consumeMatch("<!")) {
> > +                    // xml processing instruction or <!DOCTYPE> tag
> > +                    // we don't need to actually do anything here
> > +                    log.debug("PI found at line " + 
> > getCurrentLineNumber());
> >                  } else if (consumeMatch("</")) {
> >                      // VERIFY: is "< / foo >" a valid end-tag?
> >
> > @@ -306,10 +345,17 @@
> >                      // the current info until the end of this tag.
> >                      currentTagStart = offset - 1;
> >                      currentTagName = consumeElementName();
> > -                    state = STATE_IN_TAG;
> > +                    if (currentTagName == null) {
> > +                        log.warn("Invalid HTML; bare lessthan sign found 
> > at line "
> > +                            + getCurrentLineNumber());
> > +                        // remain in STATE_READY; this isn't really the 
> > start of
> > +                        // an xml element.
> > +                    } else {
> > +                        state = STATE_IN_TAG;
> > +                    }
> >                  } else {
> >                      // should never get here
> > -                    throw new Error("Internal error");
> > +                    throw new Error("Internal error at line " + 
> > getCurrentLineNumber());
> >                  }
> >
> >                  continue;
> > @@ -378,7 +424,6 @@
> >       */
> >      void openedTag(int startOffset, int endOffset, String tagName) {
> >          log.debug("Found open tag at " + startOffset + ":" + endOffset + 
> > ":" + tagName);
> > -        System.out.println("Found open tag at " + startOffset + ":" + 
> > endOffset + ":" + tagName);
> >
> >          if ("head".equalsIgnoreCase(tagName)) {
> >              listener.openedStartTag(startOffset, HEAD_TAG);
> > @@ -394,7 +439,6 @@
> >
> >      void closedTag(int startOffset, int endOffset, String tagName) {
> >          log.debug("Found close tag at " + startOffset + ":" + endOffset + 
> > ":" + tagName);
> > -        System.out.println("Found close tag at " + startOffset + ":" + 
> > endOffset + ":" + tagName);
> >
> >          if ("head".equalsIgnoreCase(tagName)) {
> >              listener.openedEndTag(startOffset, HEAD_TAG);
> >
> >
> > Index: ReducedHTMLParserTest.java
> > ===================================================================
> > --- ReducedHTMLParserTest.java  (revision 329925)
> > +++ ReducedHTMLParserTest.java  (working copy)
> > @@ -322,8 +322,19 @@
> >          parser.consumeExcept("z");
> >      }
> >
> > +    // test parsing completes when invalid tag found.
> > +    public void testParseBadTag() {
> > +        String s = "xxxx \n\n <# \n\n";
> > +        CallbackListener listener = new ParseCallbackListener();
> > +        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
> > +
> > +        parser.parse();
> > +        assertTrue(parser.isFinished());
> > +    }
> > +
> >      // test the full parse method
> >      public void testParse() {
> > +        String s0 = "<!DOCTYPE PUBLIC \"sss\" \"http:foo\">\n";
> >          String s1 = "<html><head>";
> >          String s2 = "\n<!-- a comment --><title>foo</title>";
> >          String s3 = "</head>";
> > @@ -338,6 +349,7 @@
> >          String s8 = "</body> </html>";
> >
> >          StringBuffer buf = new StringBuffer();
> > +        buf.append(s0);
> >          buf.append(s1);
> >          buf.append(s2);
> >          buf.append(s3);
> > @@ -354,13 +366,13 @@
> >
> >          // check that listener has correctly computed the offset to the 
> > char just
> >          // before the </head> tag starts.
> > -        int afterHeadPos = s1.length();
> > +        int afterHeadPos = s0.length() + s1.length();
> >          assertEquals("Pos after <head> tag ", afterHeadPos, 
> > listener.headerInsertPosition);
> >
> > -        int beforeBodyPos = s1.length() + s2.length() + s3.length();
> > +        int beforeBodyPos = afterHeadPos + s2.length() + s3.length();
> >          assertEquals("Pos before <body> tag", beforeBodyPos, 
> > listener.beforeBodyPosition);
> >
> > -        int afterBodyPos = s1.length() + s2.length() + s3.length() + 
> > s4.length();
> > +        int afterBodyPos = beforeBodyPos + s4.length();
> >          assertEquals("Pos after <body> tag", afterBodyPos, 
> > listener.bodyInsertPosition);
> >      }
> >  }
> >
> >
> >
>
>
> --
>
> http://www.irian.at
> Your JSF powerhouse -
> JSF Trainings in English and German
>


--

http://www.irian.at
Your JSF powerhouse -
JSF Trainings in English and German

Re: ReducedHTMLParser issues

Reply via email to