[netbeans] branch master updated: Stabelize html lexer in broken cases

matthiasblaesing Sun, 02 Apr 2023 08:28:13 -0700

This is an automated email from the ASF dual-hosted git repository.

matthiasblaesing pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/netbeans.git



The following commit(s) were added to refs/heads/master by this push:
     new 18c2e51daf Stabelize html lexer in broken cases
     new 5fc81d9194 Merge pull request #5756 from 
matthiasblaesing/stabilize_html_lexer
18c2e51daf is described below

commit 18c2e51daf88fe1cbc881aebbb601f938c8f3bea
Author: Matthias Bläsing <[email protected]>
AuthorDate: Fri Mar 31 21:16:59 2023 +0200

    Stabelize html lexer in broken cases
    
    Faced with a structure like this:
    
    "<Jm/  a=q</>>V  />G>"
    
    The html lexer hit an assertion as it tried to lex "a=q" as an
    attribute value. The sequence "<Jm/" at first looks like the start of
    a tag, but that is discarded once the "/" is read. Yet the lexer state
    is then set to ISP_TAG_X instead of being reset to INIT.
    
    Attribute lexing assumes that a tag was lexed first and that this
    information is still present, which is not the case here.
---
 .../src/org/netbeans/lib/html/lexer/HtmlLexer.java |   6 +-
 .../org/netbeans/lib/html/lexer/HtmlLexerTest.java | 121 +++++++++++++--------
 2 files changed, 77 insertions(+), 50 deletions(-)

diff --git a/ide/html.lexer/src/org/netbeans/lib/html/lexer/HtmlLexer.java 
b/ide/html.lexer/src/org/netbeans/lib/html/lexer/HtmlLexer.java
index 63b49d336a..5d14036d95 100644
--- a/ide/html.lexer/src/org/netbeans/lib/html/lexer/HtmlLexer.java
+++ b/ide/html.lexer/src/org/netbeans/lib/html/lexer/HtmlLexer.java
@@ -792,14 +792,14 @@ public final class HtmlLexer implements 
Lexer<HTMLTokenId> {
 
                 case ISP_TAG_X_ERROR:
                     if(isWS(actChar)) {
-                        lexerState = ISP_TAG_X;
+                        lexerState = tag == null ? INIT : ISP_TAG_X;
                         input.backup(1); //backup the WS
                         return token(HTMLTokenId.ERROR);
                     }
                     switch(actChar) {
                         case '/':
                         case '>':
-                            lexerState = ISP_TAG_X;
+                            lexerState = tag == null ? INIT : ISP_TAG_X;
                             input.backup(1); //lets reread the token again
                             return token(HTMLTokenId.ERROR);
                     }
@@ -822,7 +822,7 @@ public final class HtmlLexer implements Lexer<HTMLTokenId> {
                             lexerState = INIT;
                             return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
                         default:
-                            lexerState = ISP_TAG_X;
+                            lexerState = tag == null ? INIT : ISP_TAG_X;
                             input.backup(1);
                             return token(HTMLTokenId.ERROR);
                     }
diff --git 
a/ide/html.lexer/test/unit/src/org/netbeans/lib/html/lexer/HtmlLexerTest.java 
b/ide/html.lexer/test/unit/src/org/netbeans/lib/html/lexer/HtmlLexerTest.java
index 23de053f36..22dca42754 100644
--- 
a/ide/html.lexer/test/unit/src/org/netbeans/lib/html/lexer/HtmlLexerTest.java
+++ 
b/ide/html.lexer/test/unit/src/org/netbeans/lib/html/lexer/HtmlLexerTest.java
@@ -375,52 +375,6 @@ public class HtmlLexerTest extends NbTestCase {
             "=|OPERATOR", "'test'|VALUE", " |WS", "/>|TAG_CLOSE_SYMBOL");
     }
 
-
-    
//--------------------------------------------------------------------------
-    
-    public static void checkTokens(String text, String... descriptions) {
-        TokenHierarchy<String> th = TokenHierarchy.create(text, 
HTMLTokenId.language());
-        TokenSequence<HTMLTokenId> ts = 
th.tokenSequence(HTMLTokenId.language());
-//        System.out.println(ts);
-        checkTokens(ts, descriptions);
-    }
-
-    public static void checkTokens(TokenSequence<HTMLTokenId> ts, String... 
descriptions) {
-        ts.moveStart();
-        for(String descr : descriptions) {
-            //parse description
-            int slashIndex = descr.indexOf('|');
-            assert slashIndex >= 0;
-
-            String image = descr.substring(0, slashIndex);
-            String id = descr.substring(slashIndex + 1);
-
-            assertTrue(ts.moveNext());
-            Token t = ts.token();
-            assertNotNull(t);
-
-            if(image.length() > 0) {
-                assertEquals(image, t.text().toString());
-            }
-
-            if(id.length() > 0) {
-                assertEquals(id, t.id().name());
-            }
-        }
-
-        StringBuilder b = new StringBuilder();
-        while(ts.moveNext()) {
-            Token t = ts.token();
-            b.append("\"");
-            b.append(t.text());
-            b.append('|');
-            b.append(t.id().name());
-            b.append("\"");
-            b.append(", ");
-        }
-        assertTrue("There are some tokens left: " + b.toString(), b.length() 
== 0);
-    }
-
      public void testScriptType_value() {
         TokenHierarchy th = TokenHierarchy.create("<script 
type=\"text/plain\">plain</script>", HTMLTokenId.language());
         TokenSequence ts = th.tokenSequence();
@@ -473,5 +427,78 @@ public class HtmlLexerTest extends NbTestCase {
         
         assertTrue("Couldn't find any SCRIPT token!", false);
     }
-    
+
+    public void testBrokenCases() {
+        // Two cases that were caught by randomness test and added here as
+        // regression check
+        assertLexesWithoutAssertion("u >z//<=>=>\n"
+                + "= \n"
+                + ">>>/>>//w>\n"
+                + " yl>be<<=k<uA>  F > Y <<<\n"
+                + " >Jj/>k >>  ==a <z<=z  ><=r>>> =N///>>>/><< \n"
+                + "EN>/>r> >p<<<L = > =<g=<C /=/   \n"
+                + " q=R>  >B >=>z= />> J o  </>/>zn/><<>   z/>/>G=>nm< ");
+        assertLexesWithoutAssertion("<Jm/  a=q</>>V  />G>< qw\n"
+                + " ></> /F>=< g>ggh i <>PoU =<  =eB< < / <<= >><==<>/> 
Q></>N=>/>=<>w/>=p  //>>sNk <<>=</> >\n"
+                + "==  <>Np >= <<</>U  yZT />=<e=<  >");
+    }
+
+    
//--------------------------------------------------------------------------
+
+    public static void checkTokens(String text, String... descriptions) {
+        TokenHierarchy<String> th = TokenHierarchy.create(text, 
HTMLTokenId.language());
+        TokenSequence<HTMLTokenId> ts = 
th.tokenSequence(HTMLTokenId.language());
+//        System.out.println(ts);
+        checkTokens(ts, descriptions);
+    }
+
+    public static void checkTokens(TokenSequence<HTMLTokenId> ts, String... 
descriptions) {
+        ts.moveStart();
+        for(String descr : descriptions) {
+            //parse description
+            int slashIndex = descr.indexOf('|');
+            assert slashIndex >= 0;
+
+            String image = descr.substring(0, slashIndex);
+            String id = descr.substring(slashIndex + 1);
+
+            assertTrue(ts.moveNext());
+            Token t = ts.token();
+            assertNotNull(t);
+
+            if(image.length() > 0) {
+                assertEquals(image, t.text().toString());
+            }
+
+            if(id.length() > 0) {
+                assertEquals(id, t.id().name());
+            }
+        }
+
+        StringBuilder b = new StringBuilder();
+        while(ts.moveNext()) {
+            Token t = ts.token();
+            b.append("\"");
+            b.append(t.text());
+            b.append('|');
+            b.append(t.id().name());
+            b.append("\"");
+            b.append(", ");
+        }
+        assertTrue("There are some tokens left: " + b.toString(), b.length() 
== 0);
+    }
+
+    @SuppressWarnings({"AssertWithSideEffects", "NestedAssignment"})
+    public void assertLexesWithoutAssertion(String input) {
+        boolean assertionsEnabled = false;
+        assert assertionsEnabled = true;
+        assertTrue("Test must be run with assertions enabled", 
assertionsEnabled);
+        TokenHierarchy<String> th = TokenHierarchy.create(input, 
HTMLTokenId.language());
+        TokenSequence<HTMLTokenId> ts = 
th.tokenSequence(HTMLTokenId.language());
+        // if there is no token sequence, lexing is obviously broken
+        assertNotNull(ts);
+        // iterate the full token sequence, lexing happens on demand, so the
+        // whole sequence needs to be read
+        while(ts.moveNext()) {};
     }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

For further information about the NetBeans mailing lists, visit:
https://cwiki.apache.org/confluence/display/NETBEANS/Mailing+lists

[netbeans] branch master updated: Stabelize html lexer in broken cases

Reply via email to