Re: ReducedHTMLParser issues

Simon Kitching Mon, 31 Oct 2005 15:08:17 -0800

Martin Marinschek wrote:

Don't stress yourself - it's just the nightly build, so not too big of a problem.


Thanks, but it's hopefully done anyway.

changes:
 * Handle DOCTYPE and Processing Instruction commands in input HTML
 * Track line# of input for error messages
 * Remove some debugging printlns

I can also provide a patch soon to format the code to the MyFaces convention rather than the Sun convention if you wish. Sorry, my Eclipse is set up to format stuff that way automatically and I forgot to reformat before posting.


Regards,

Simon

Index: ReducedHTMLParser.java
===================================================================
--- ReducedHTMLParser.java      (revision 329922)
+++ ReducedHTMLParser.java      (working copy)
@@ -49,6 +49,7 @@
     private static final int STATE_IN_TAG = 2;
     
     private int offset;
+    private int lineNumber;
     private CharSequence seq;
     private CallbackListener listener;
     
@@ -75,15 +76,32 @@
         return offset >= seq.length();
     }
 
+    int getCurrentLineNumber() {
+        return lineNumber;
+    }
+
     /**
      * Advance the current parse position over any whitespace characters.
      */
     void consumeWhitespace() {
+        boolean crSeen = false;
+
         while (offset < seq.length()) {
             char c = seq.charAt(offset);
             if (!Character.isWhitespace(c)) {
                 break;
             }
+            
+            // Track line number for error messages.
+            if (c == '\r') {
+                ++lineNumber;
+                crSeen = true;
+            } else if ((c == '\n') && !crSeen) {
+                ++lineNumber;
+            } else {
+                crSeen = false;
+            }
+
             ++offset;
         }
     }
@@ -193,6 +211,10 @@
         // TODO: should we consider a string to be terminated by a newline?
         // that would help with runaway strings but I think that multiline
         // strings *are* allowed...
+        //
+        // TODO: detect newlines within strings and increment lineNumber.
+        // This isn't so important, though; they aren't common and being a
+        // few lines out in an error message isn't serious either.
         StringBuffer stringBuf = new StringBuffer();
         boolean escaping = false;
         while (!isFinished()) {
@@ -248,6 +270,8 @@
      * @param s is a set of characters that should not be discarded.
      */
     void consumeExcept(String s) {
+        boolean crSeen = false;
+
         while (offset < seq.length()) {
             char c = seq.charAt(offset);
             if (s.indexOf(c) >= 0) {
@@ -255,6 +279,16 @@
                 return;
             }
             
+            // Track line number for error messages.
+            if (c == '\r') {
+                ++lineNumber;
+                crSeen = true;
+            } else if ((c == '\n') && !crSeen) {
+                ++lineNumber;
+            } else {
+                crSeen = false;
+            }
+
             ++offset;
         }
     }
@@ -269,6 +303,7 @@
         int currentTagStart = -1;
         String currentTagName = null;
         
+        lineNumber = 1;
         offset = 0;
         while (offset < seq.length())
         {
@@ -282,6 +317,10 @@
                 if (consumeMatch("<!--")) {
                     // VERIFY: can "< ! --" start a comment?
                     state = STATE_IN_COMMENT;
+                } else if (consumeMatch("<!")) {
+                    // xml processing instruction or <!DOCTYPE> tag
+                    // we don't need to actually do anything here
+                    log.debug("PI found at line " + getCurrentLineNumber());
                 } else if (consumeMatch("</")) {
                     // VERIFY: is "< / foo >" a valid end-tag?
 
@@ -306,10 +345,17 @@
                     // the current info until the end of this tag.
                     currentTagStart = offset - 1;
                     currentTagName = consumeElementName();
-                    state = STATE_IN_TAG;
+                    if (currentTagName == null) {
+                        log.warn("Invalid HTML; bare lessthan sign found at 
line "
+                            + getCurrentLineNumber());
+                        // remain in STATE_READY; this isn't really the start 
of
+                        // an xml element.
+                    } else {
+                        state = STATE_IN_TAG;
+                    }
                 } else {
                     // should never get here
-                    throw new Error("Internal error");
+                    throw new Error("Internal error at line " + 
getCurrentLineNumber());
                 }
                 
                 continue;
@@ -378,7 +424,6 @@
      */
     void openedTag(int startOffset, int endOffset, String tagName) {
         log.debug("Found open tag at " + startOffset + ":" + endOffset + ":" + 
tagName);
-        System.out.println("Found open tag at " + startOffset + ":" + 
endOffset + ":" + tagName);
         
         if ("head".equalsIgnoreCase(tagName)) {
             listener.openedStartTag(startOffset, HEAD_TAG);
@@ -394,7 +439,6 @@
 
     void closedTag(int startOffset, int endOffset, String tagName) {
         log.debug("Found close tag at " + startOffset + ":" + endOffset + ":" 
+ tagName);
-        System.out.println("Found close tag at " + startOffset + ":" + 
endOffset + ":" + tagName);
         
         if ("head".equalsIgnoreCase(tagName)) {
             listener.openedEndTag(startOffset, HEAD_TAG);

Index: ReducedHTMLParserTest.java
===================================================================
--- ReducedHTMLParserTest.java  (revision 329925)
+++ ReducedHTMLParserTest.java  (working copy)
@@ -322,8 +322,19 @@
         parser.consumeExcept("z");
     }
 
+    // test parsing completes when invalid tag found.
+    public void testParseBadTag() {
+        String s = "xxxx \n\n <# \n\n";
+        CallbackListener listener = new ParseCallbackListener();
+        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+        
+        parser.parse();
+        assertTrue(parser.isFinished());
+    }
+
     // test the full parse method
     public void testParse() {
+        String s0 = "<!DOCTYPE PUBLIC \"sss\" \"http:foo\">\n";
         String s1 = "<html><head>";
         String s2 = "\n<!-- a comment --><title>foo</title>";
         String s3 = "</head>";
@@ -338,6 +349,7 @@
         String s8 = "</body> </html>";
 
         StringBuffer buf = new StringBuffer();
+        buf.append(s0);
         buf.append(s1);
         buf.append(s2);
         buf.append(s3);
@@ -354,13 +366,13 @@
         
         // check that listener has correctly computed the offset to the char 
just
         // before the </head> tag starts.
-        int afterHeadPos = s1.length();
+        int afterHeadPos = s0.length() + s1.length();
         assertEquals("Pos after <head> tag ", afterHeadPos, 
listener.headerInsertPosition);
         
-        int beforeBodyPos = s1.length() + s2.length() + s3.length();
+        int beforeBodyPos = afterHeadPos + s2.length() + s3.length();
         assertEquals("Pos before <body> tag", beforeBodyPos, 
listener.beforeBodyPosition);
         
-        int afterBodyPos = s1.length() + s2.length() + s3.length() + 
s4.length();
+        int afterBodyPos = beforeBodyPos + s4.length();
         assertEquals("Pos after <body> tag", afterBodyPos, 
listener.bodyInsertPosition);
     }
 }

Re: ReducedHTMLParser issues

Reply via email to