Martin Marinschek wrote:
Don't stress yourself - it's just the nightly build, so not too big of a problem.

Thanks, but it's hopefully done anyway.

changes:
 * Handle DOCTYPE and Processing Instruction commands in input HTML
 * Track line# of input for error messages
 * Remove some debugging printlns

I can also provide a patch soon to format the code to the MyFaces convention rather than the Sun convention if you wish. Sorry, my Eclipse is set up to format stuff that way automatically and I forgot to reformat before posting.

Regards,

Simon
Index: ReducedHTMLParser.java
===================================================================
--- ReducedHTMLParser.java      (revision 329922)
+++ ReducedHTMLParser.java      (working copy)
@@ -49,6 +49,7 @@
     private static final int STATE_IN_TAG = 2;
     
     private int offset;
+    private int lineNumber;
     private CharSequence seq;
     private CallbackListener listener;
     
@@ -75,15 +76,32 @@
         return offset >= seq.length();
     }
 
+    int getCurrentLineNumber() {
+        return lineNumber;
+    }
+
     /**
      * Advance the current parse position over any whitespace characters.
      */
     void consumeWhitespace() {
+        boolean crSeen = false;
+
         while (offset < seq.length()) {
             char c = seq.charAt(offset);
             if (!Character.isWhitespace(c)) {
                 break;
             }
+            
+            // Track line number for error messages.
+            if (c == '\r') {
+                ++lineNumber;
+                crSeen = true;
+            } else if ((c == '\n') && !crSeen) {
+                ++lineNumber;
+            } else {
+                crSeen = false;
+            }
+
             ++offset;
         }
     }
@@ -193,6 +211,10 @@
         // TODO: should we consider a string to be terminated by a newline?
         // that would help with runaway strings but I think that multiline
         // strings *are* allowed...
+        //
+        // TODO: detect newlines within strings and increment lineNumber.
+        // This isn't so important, though; they aren't common and being a
+        // few lines out in an error message isn't serious either.
         StringBuffer stringBuf = new StringBuffer();
         boolean escaping = false;
         while (!isFinished()) {
@@ -248,6 +270,8 @@
      * @param s is a set of characters that should not be discarded.
      */
     void consumeExcept(String s) {
+        boolean crSeen = false;
+
         while (offset < seq.length()) {
             char c = seq.charAt(offset);
             if (s.indexOf(c) >= 0) {
@@ -255,6 +279,16 @@
                 return;
             }
             
+            // Track line number for error messages.
+            if (c == '\r') {
+                ++lineNumber;
+                crSeen = true;
+            } else if ((c == '\n') && !crSeen) {
+                ++lineNumber;
+            } else {
+                crSeen = false;
+            }
+
             ++offset;
         }
     }
@@ -269,6 +303,7 @@
         int currentTagStart = -1;
         String currentTagName = null;
         
+        lineNumber = 1;
         offset = 0;
         while (offset < seq.length())
         {
@@ -282,6 +317,10 @@
                 if (consumeMatch("<!--")) {
                     // VERIFY: can "< ! --" start a comment?
                     state = STATE_IN_COMMENT;
+                } else if (consumeMatch("<!")) {
+                    // xml processing instruction or <!DOCTYPE> tag
+                    // we don't need to actually do anything here
+                    log.debug("PI found at line " + getCurrentLineNumber());
                 } else if (consumeMatch("</")) {
                     // VERIFY: is "< / foo >" a valid end-tag?
 
@@ -306,10 +345,17 @@
                     // the current info until the end of this tag.
                     currentTagStart = offset - 1;
                     currentTagName = consumeElementName();
-                    state = STATE_IN_TAG;
+                    if (currentTagName == null) {
+                        log.warn("Invalid HTML; bare lessthan sign found at line "
+                            + getCurrentLineNumber());
+                        // remain in STATE_READY; this isn't really the start of
+                        // an xml element.
+                    } else {
+                        state = STATE_IN_TAG;
+                    }
                 } else {
                     // should never get here
-                    throw new Error("Internal error");
+                    throw new Error("Internal error at line " + getCurrentLineNumber());
                 }
                 
                 continue;
@@ -378,7 +424,6 @@
      */
     void openedTag(int startOffset, int endOffset, String tagName) {
         log.debug("Found open tag at " + startOffset + ":" + endOffset + ":" + tagName);
-        System.out.println("Found open tag at " + startOffset + ":" + endOffset + ":" + tagName);
         
         if ("head".equalsIgnoreCase(tagName)) {
             listener.openedStartTag(startOffset, HEAD_TAG);
@@ -394,7 +439,6 @@
 
     void closedTag(int startOffset, int endOffset, String tagName) {
         log.debug("Found close tag at " + startOffset + ":" + endOffset + ":" + tagName);
-        System.out.println("Found close tag at " + startOffset + ":" + endOffset + ":" + tagName);
         
         if ("head".equalsIgnoreCase(tagName)) {
             listener.openedEndTag(startOffset, HEAD_TAG);
Index: ReducedHTMLParserTest.java
===================================================================
--- ReducedHTMLParserTest.java  (revision 329925)
+++ ReducedHTMLParserTest.java  (working copy)
@@ -322,8 +322,19 @@
         parser.consumeExcept("z");
     }
 
+    // test parsing completes when invalid tag found.
+    public void testParseBadTag() {
+        String s = "xxxx \n\n <# \n\n";
+        CallbackListener listener = new ParseCallbackListener();
+        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+        
+        parser.parse();
+        assertTrue(parser.isFinished());
+    }
+
     // test the full parse method
     public void testParse() {
+        String s0 = "<!DOCTYPE PUBLIC \"sss\" \"http:foo\">\n";
         String s1 = "<html><head>";
         String s2 = "\n<!-- a comment --><title>foo</title>";
         String s3 = "</head>";
@@ -338,6 +349,7 @@
         String s8 = "</body> </html>";
 
         StringBuffer buf = new StringBuffer();
+        buf.append(s0);
         buf.append(s1);
         buf.append(s2);
         buf.append(s3);
@@ -354,13 +366,13 @@
         
         // check that listener has correctly computed the offset to the char just
         // before the </head> tag starts.
-        int afterHeadPos = s1.length();
+        int afterHeadPos = s0.length() + s1.length();
         assertEquals("Pos after <head> tag ", afterHeadPos, listener.headerInsertPosition);
         
-        int beforeBodyPos = s1.length() + s2.length() + s3.length();
+        int beforeBodyPos = afterHeadPos + s2.length() + s3.length();
         assertEquals("Pos before <body> tag", beforeBodyPos, listener.beforeBodyPosition);
         
-        int afterBodyPos = s1.length() + s2.length() + s3.length() + s4.length();
+        int afterBodyPos = beforeBodyPos + s4.length();
         assertEquals("Pos after <body> tag", afterBodyPos, listener.bodyInsertPosition);
     }
 }

Reply via email to