jenkins-bot has submitted this change and it was merged.

Change subject: Improve broken attribute heuristics
......................................................................


Improve broken attribute heuristics

 * Now that these rules are clearly marked for tables, we see where
   breaking on ! and | makes sense.

 * The blacklist change is from permitting | generic attribute names
   which seems correct. In I941c595ef150666ea4535289d16e86c425c24389
   however, the tags that we're permitting should be restricted to a
   single line. Maybe an inline break on \n is called for.

Change-Id: I8307b61bc784d28f7ce386152a500d6b78d36d87
---
M lib/pegTokenizer.pegjs.txt
M tests/parserTests-blacklist.js
2 files changed, 30 insertions(+), 40 deletions(-)

Approvals:
  Subramanya Sastry: Looks good to me, approved
  jenkins-bot: Verified

Objections:
  Arlolra: There's a problem with this change, please improve



diff --git a/lib/pegTokenizer.pegjs.txt b/lib/pegTokenizer.pegjs.txt
index 6d7626f..4c19304 100644
--- a/lib/pegTokenizer.pegjs.txt
+++ b/lib/pegTokenizer.pegjs.txt
@@ -1202,7 +1202,7 @@
     }
 
 could_be_attribute =
-    // quick sanity check before expensive attribute_preprocessor_text_line
+    // quick sanity check before expensive table_attribute_preprocessor_text
     // rule. Also try to parse on [|!+;] for now which seem to be common
     // syntax errors in production that hidden by the PHP parser (by stripping
     // the 'attributes').
@@ -1284,7 +1284,7 @@
 // The arrangement of chars is to emphasize the split between what's disallowed
 // by html5 and what's necessary to give directive a chance.
 generic_attribute_name
-  = r:( $[^ \t\0\n\r/=>"'!<&\[\]|{}\-]+
+  = r:( $[^ \t\0\n\r/=>"'<&\[\]{}\-]+
         / ! inline_breaks
           ! '/>'
           // /=>"' is the html5 attribute name set we do not want.
@@ -1298,7 +1298,7 @@
 // Same as generic_attribute_name, except for accepting tags found here.
 // That doesn't make sense (ie. match php) in the generic case.
 table_attribute_name
-  = r:( $[^ \t\0\n\r/=>"'!<&\[\]|{}\-]+
+  = r:( $[^ \t\0\n\r/=>"'<&\[\]{}\-!|]+
         / ! inline_breaks
           ! '/>'
           // /=>"' is the html5 attribute name set we do not want.
@@ -1332,8 +1332,8 @@
       t1:attribute_preprocessor_text_single? "'" {
         return tu.getAttributeValueAndSource(input, t1, startOffset(), 
endOffset() - 1);
       }
-      // Missing end quote: accept | and > look-ahead as heuristic
-      / t2:attribute_preprocessor_text_single_broken? &[|>] {
+      // Missing end quote: accept > look-ahead as heuristic
+      / t2:attribute_preprocessor_text_single_broken? &'>' {
         return tu.getAttributeValueAndSource(input, t2, startOffset(), 
endOffset());
       }
     ) { return r; }
@@ -1341,36 +1341,36 @@
       t1:attribute_preprocessor_text_double? '"' {
         return tu.getAttributeValueAndSource(input, t1, startOffset(), 
endOffset() - 1);
       }
-      // Missing end quote: accept | and > look-ahead as heuristic
-      / t2:attribute_preprocessor_text_double_broken? &[|>] {
+      // Missing end quote: accept > look-ahead as heuristic
+      / t2:attribute_preprocessor_text_double_broken? &'>' {
         return tu.getAttributeValueAndSource(input, t2, startOffset(), 
endOffset());
       }
     ) { return r; }
-  / s:$space_or_newline* t:attribute_preprocessor_text !"=" {
+  / s:$space_or_newline* t:attribute_preprocessor_text !'=' {
       return tu.getAttributeValueAndSource(input, t, startOffset() + s.length, 
endOffset());
     }
 
 // Attribute value, restricted to a single line.
 table_att_value
   = space* "'" r:(
-      t1:attribute_preprocessor_text_single_line?  "'" {
+      t1:table_attribute_preprocessor_text_single?  "'" {
         return tu.getAttributeValueAndSource(input, t1, startOffset(), 
endOffset() - 1);
       }
-      // Missing end quote: accept | and > look-ahead as heuristic
-      / t2:attribute_preprocessor_text_single_line_broken? &[|>\n] {
+      // Missing end quote: accept |, !, and \n look-ahead as heuristic
+      / t2:table_attribute_preprocessor_text_single_broken? &[|!\n] {
         return tu.getAttributeValueAndSource(input, t2, startOffset(), 
endOffset());
       }
     ) { return r; }
   / space* '"' r:(
-      t1:attribute_preprocessor_text_double_line? '"' {
+      t1:table_attribute_preprocessor_text_double? '"' {
         return tu.getAttributeValueAndSource(input, t1, startOffset(), 
endOffset() - 1);
       }
-      // Missing end quote: accept | and > look-ahead as heuristic
-      / t2:attribute_preprocessor_text_double_line_broken? &[|>\n] {
+      // Missing end quote: accept |, !, and \n look-ahead as heuristic
+      / t2:table_attribute_preprocessor_text_double_broken? &[|!\n] {
         return tu.getAttributeValueAndSource(input, t2, startOffset(), 
endOffset());
       }
     ) { return r; }
-  / s:$space* t:attribute_preprocessor_text_line !"=" {
+  / s:$space* t:table_attribute_preprocessor_text !'=' {
       return tu.getAttributeValueAndSource(input, t, startOffset() + s.length, 
endOffset());
     }
 
@@ -2040,7 +2040,7 @@
   }
 
 attribute_preprocessor_text_single
-  = r:( $[^{}&'<\-]+
+  = r:( $[^{}&<\-']+
   / !inline_breaks s:(
       directive
     / [{}&<\-] ) { return s; }
@@ -2050,7 +2050,7 @@
   }
 
 attribute_preprocessor_text_single_broken
-  = r:( $[^{}&'<>|\-]+
+  = r:( $[^{}&<\-'>]+
   / !inline_breaks s:(
       directive
     / [{}&<\-] ) { return s; }
@@ -2060,7 +2060,7 @@
   }
 
 attribute_preprocessor_text_double
-  = r:( $[^{}&"<\-]+
+  = r:( $[^{}&<\-"]+
   / !inline_breaks s:(
       directive
     / [{}&<\-] ) { return s; }
@@ -2070,7 +2070,7 @@
   }
 
 attribute_preprocessor_text_double_broken
-  = r:( $[^{}&"<>|\-]+
+  = r:( $[^{}&<\-">]+
   / !inline_breaks s:(
       directive
     / [{}&<\-] ) { return s; }
@@ -2080,8 +2080,8 @@
   }
 
 // Variants with the entire attribute on a single line
-attribute_preprocessor_text_line
-  = r:( $[^=<>\n\r&'"\t \[\]|{}/!\-]+
+table_attribute_preprocessor_text
+  = r:( $[^=<>\n\r&'"\t \[\]{}/\-|!]+
         / !inline_breaks
           !'/>'
           t:( directive
@@ -2089,29 +2089,29 @@
           ) { return t; }
     )+ { return tu.flattenString(r); }
 
-attribute_preprocessor_text_single_line
-  = r:( $[^{}&'<\n\-]+
+table_attribute_preprocessor_text_single
+  = r:( $[^{}&<\-'\n]+
   / !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
   )* {
       return tu.flattenString(r);
   }
 
-attribute_preprocessor_text_single_line_broken
-  = r:( $[^{}&'<>|!\n\-]+
+table_attribute_preprocessor_text_single_broken
+  = r:( $[^{}&<\-'\n|!]+
   / !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
   )* {
       return tu.flattenString(r);
   }
 
-attribute_preprocessor_text_double_line
-  = r:( $[^{}&"<\n\-]+
+table_attribute_preprocessor_text_double
+  = r:( $[^{}&<\-"\n]+
   / !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
   )* {
       return tu.flattenString(r);
   }
 
-attribute_preprocessor_text_double_line_broken
-  = r:( $[^{}&"<>|!\n\-]+
+table_attribute_preprocessor_text_double_broken
+  = r:( $[^{}&<\-"\n|!]+
   / !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
   )* {
       return tu.flattenString(r);
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 5802270..930a510 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -184,7 +184,7 @@
 add("wt2html", "Fuzz testing: Parser14", "<h2 
data-parsoid='{\"dsr\":[0,18,2,2]}'> onmouseover= </h2>\n<p 
data-parsoid='{\"dsr\":[19,33,0,0]}'><a rel=\"mw:ExtLink\" 
href=\"http://__TOC__\"; 
data-parsoid='{\"stx\":\"url\",\"dsr\":[19,33,0,0]}'>http://__TOC__</a></p>");
 add("wt2html", "Fuzz testing: Parser14-table", "<h2 
data-parsoid='{\"dsr\":[0,5,2,2]}'>a</h2>\n<table style=\"__TOC__\" 
data-parsoid='{\"autoInsertedEnd\":true,\"dsr\":[6,22,16,0]}'></table>");
 add("wt2html", "Fuzz testing: Parser24", "<p 
data-parsoid='{\"fostered\":true,\"autoInsertedEnd\":true,\"dsr\":[0,0]}'><span 
typeof=\"mw:Nowiki\" 
data-parsoid='{\"src\":\"{{{\",\"dsr\":[3,6,0,0]}'>{{{</span>|\n<u class=\"|\" 
about=\"#mwt2\" typeof=\"mw:ExpandedAttrs\" 
data-parsoid='{\"stx\":\"html\",\"a\":{\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx-->&lt;noinclude>}}}}\":null},\"sa\":{\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx-->&lt;noinclude>}}}}\":\"\"},\"autoInsertedEnd\":true,\"dsr\":[8,0,74,0]}'
 
data-mw='{\"attribs\":[[{\"txt\":\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx-->}}}}\",\"html\":\"&lt;span
 about=\\\"#mwt1\\\" typeof=\\\"mw:Param\\\" 
data-parsoid=\\\"{&amp;quot;dsr&amp;quot;:[20,79,null,null],&amp;quot;src&amp;quot;:&amp;quot;{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx-->&lt;noinclude>}}}&amp;quot;}\\\">{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--&amp;gt;}}}&lt;/span>}\"},{\"html\":\"\"}]]}'>\n<br
 style=\"onmouseover='alert(document.cookie);' \" 
data-parsoid='{\"stx\":\"html\",\"selfClose\":true,\"dsr\":[83,0,53,0]}'/></u></p><p
 data-parsoid='{\"fostered\":true,\"autoInsertedEnd\":true,\"dsr\":[0,0]}'><u 
class=\"|\" about=\"#mwt2\" typeof=\"mw:ExpandedAttrs\" 
data-parsoid='{\"stx\":\"html\",\"a\":{\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx-->&lt;noinclude>}}}}\":null},\"sa\":{\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx-->&lt;noinclude>}}}}\":\"\"},\"autoInsertedEnd\":true,\"autoInsertedStart\":true,\"dsr\":[-37,0,0,0]}'
 
data-mw='{\"attribs\":[[{\"txt\":\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx-->}}}}\",\"html\":\"&lt;span
 about=\\\"#mwt1\\\" typeof=\\\"mw:Param\\\" 
data-parsoid=\\\"{&amp;quot;dsr&amp;quot;:[20,79,null,null],&amp;quot;src&amp;quot;:&amp;quot;{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx-->&lt;noinclude>}}}&amp;quot;}\\\">{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--&amp;gt;}}}&lt;/span>}\"},{\"html\":\"\"}]]}'>MOVE
 YOUR MOUSE CURSOR OVER THIS TEXT</u></p><table 
data-parsoid='{\"autoInsertedEnd\":true,\"dsr\":[0,177,2,0]}'>\n\n\n\n<tbody 
data-parsoid='{\"dsr\":[176,177,0,0]}'><tr 
data-parsoid='{\"autoInsertedEnd\":true,\"autoInsertedStart\":true,\"dsr\":[176,177,0,0]}'><td
 
data-parsoid='{\"autoInsertedEnd\":true,\"dsr\":[176,177,1,0]}'></td></tr></tbody></table>");
-add("wt2html", "Fuzz testing: Parser25 (bug 6055)", "<span about=\"#mwt1\" 
typeof=\"mw:Param\" data-parsoid='{\"dsr\":[0,26,null,null],\"src\":\"{{{\\n| 
\\n&lt;LI CLASS=||\\n >\\n}}}\"}'></span><p 
data-parsoid='{\"dsr\":[26,110,0,0]}'>blah\" onmouseover=\"alert('hello 
world');\" align=\"left\"<b 
data-parsoid='{\"autoInsertedEnd\":true,\"dsr\":[80,110,3,0]}'>MOVE MOUSE 
CURSOR OVER HERE</b></p>");
+add("wt2html", "Fuzz testing: Parser25 (bug 6055)", "<li class=\"\" 
about=\"#mwt1\" typeof=\"mw:Param\" 
data-parsoid='{\"stx\":\"html\",\"srcTagName\":\"LI\",\"a\":{\"||\":null},\"sa\":{\"||\":\"\"},\"autoInsertedEnd\":true,\"dsr\":[0,110,15,0],\"src\":\"{{{\\n|
 \\n&lt;LI CLASS=||\\n >\\n}}}blah\\\" onmouseover=\\\"alert(&#39;hello 
world&#39;);\\\" align=\\\"left\\\"&#39;&#39;&#39;MOVE MOUSE CURSOR OVER 
HERE\"}'>\n<p data-parsoid='{\"dsr\":[26,110,0,0]}'>blah\" 
onmouseover=\"alert('hello world');\" align=\"left\"<b 
data-parsoid='{\"autoInsertedEnd\":true,\"dsr\":[80,110,3,0]}'>MOVE MOUSE 
CURSOR OVER HERE</b></p></li>");
 add("wt2html", "Inline HTML vs wiki block nesting", "<p 
data-parsoid='{\"dsr\":[0,17,0,0]}'><b 
data-parsoid='{\"stx\":\"html\",\"autoInsertedEnd\":true,\"dsr\":[0,17,3,0]}'>Bold
 paragraph</b></p><b 
data-parsoid='{\"stx\":\"html\",\"autoInsertedEnd\":true,\"autoInsertedStart\":true,\"dsr\":[17,37,0,0]}'>\n\n<p
 data-parsoid='{\"dsr\":[19,37,0,0]}'>New wiki paragraph</p></b>");
 add("wt2html", "Special page transclusion", "<p about=\"#mwt1\" 
typeof=\"mw:Transclusion\" data-parsoid='{\"dsr\":[0,30,0,0],\"pi\":[[]]}' 
data-mw='{\"parts\":[{\"template\":{\"target\":{\"wt\":\"Special:Prefixindex/Xyzzyx\",\"function\":\"special\"},\"params\":{},\"i\":0}}]}'>Parser
 function implementation for pf_special missing in Parsoid.</p>");
 add("wt2html", "Special page transclusion twice (bug 5021)", "<p 
data-parsoid='{\"dsr\":[0,61,0,0]}'><span about=\"#mwt1\" 
typeof=\"mw:Transclusion\" 
data-parsoid='{\"pi\":[[]],\"dsr\":[0,30,null,null]}' 
data-mw='{\"parts\":[{\"template\":{\"target\":{\"wt\":\"Special:Prefixindex/Xyzzyx\",\"function\":\"special\"},\"params\":{},\"i\":0}}]}'>Parser
 function implementation for pf_special missing in Parsoid.</span>\n<span 
about=\"#mwt2\" typeof=\"mw:Transclusion\" 
data-parsoid='{\"pi\":[[]],\"dsr\":[31,61,null,null]}' 
data-mw='{\"parts\":[{\"template\":{\"target\":{\"wt\":\"Special:Prefixindex/Xyzzyx\",\"function\":\"special\"},\"params\":{},\"i\":0}}]}'>Parser
 function implementation for pf_special missing in Parsoid.</span></p>");
@@ -397,7 +397,6 @@
 add("wt2wt", "Fuzz testing: Parser21", "{|\n! irc://{{ftp://a\"; 
onmouseover=\"alert('hello world');\"\n|\n|}");
 add("wt2wt", "Fuzz testing: Parser22", "http://===r:::https://b\n\n{|\n|}");
 add("wt2wt", "Fuzz testing: Parser24", "{{{|\n<u class=\"|\" 
{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}>\n<br 
style=\"onmouseover='alert(document.cookie);' \" />\n\nMOVE YOUR MOUSE CURSOR 
OVER THIS TEXT\n{|\n\n|\n|}");
-add("wt2wt", "Fuzz testing: Parser25 (bug 6055)", "{{{\n| \n<LI CLASS=||\n 
>\n}}}blah\" onmouseover=\"alert('hello world');\" align=\"left\"'''MOVE MOUSE 
CURSOR OVER HERE'''\n");
 add("wt2wt", "Inline wiki vs wiki block nesting", "'''Bold paragraph'''\n\nNew 
wiki paragraph\n");
 add("wt2wt", "Mixing markup for italics and bold", 
"'<nowiki/>''bold'<nowiki/>'''''bold''bolditalics'''''\n");
 add("wt2wt", "ISBN code coverage", "ISBN 978-0-1234-56&#x20;789\n");
@@ -2157,15 +2156,6 @@
 add("selser", "Fuzz testing: Parser24 [[1,2,0],0,4]", 
"{{{sk3rl6utosgojemi|\n<u class=\"|\" 
{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}>\n<br 
style=\"onmouseover='alert(document.cookie);' \" />\n\n\n\njzny1t7i47mygb9\n");
 add("selser", "Fuzz testing: Parser24 [[4,3,0],3,1]", "ajjrvymgwee45cdi<u 
class=\"|\" {{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}>\n<br 
style=\"onmouseover='alert(document.cookie);' \" />\n{| 
data-foobar=\"q3u36zzq1lg3z0k9\"\n\n||}");
 add("selser", "Fuzz testing: Parser24 [1,1,2]", "{{{|\n<u class=\"|\" 
{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}>\n<br 
style=\"onmouseover='alert(document.cookie);' \" />\n\nMOVE YOUR MOUSE CURSOR 
OVER THIS TEXT\n\nwo1ljwyi3jlzbyb9\n{|\n{{{|\n<u CLASS=\n| 
{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}} >\n<br 
style=\"onmouseover='alert(document.cookie);' \" />\n\nMOVE YOUR MOUSE CURSOR 
OVER THIS TEXT\n|");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,2]", "{{{\n| \n<LI 
CLASS=||\n >\n}}}zstgbb6nw6nr8uxr\n\nblah\" onmouseover=\"alert('hello 
world');\" align=\"left\"'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,1]", "{{{\n| \n<LI 
CLASS=||\n >\n}}}blah\" onmouseover=\"alert('hello world');\" 
align=\"left\"'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[2,2]]", "{{{\n| \n<LI 
CLASS=||\n >\n}}}4rrl98ny5zgaxlxrblah\" onmouseover=\"alert('hello world');\" 
align=\"left\"8i02syp6jkbj4i'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[3,0]]", "{{{\n| \n<LI 
CLASS=||\n >\n}}}'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[0,2]]", "{{{\n| \n<LI 
CLASS=||\n >\n}}}blah\" onmouseover=\"alert('hello world');\" 
align=\"left\"1y1dswlzouu40a4i'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[3,2]]", "{{{\n| \n<LI 
CLASS=||\n >\n}}}4kaxzxt0lciejyvi'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[4,2]]", "{{{\n| \n<LI 
CLASS=||\n >\n}}}xfilnfj1id89qkt9ugf2zkqtlng3c8fr'''MOVE MOUSE CURSOR OVER 
HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[4,0]]", "{{{\n| \n<LI 
CLASS=||\n >\n}}}l2alwbk8ifab57b9'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[2,0]]", "{{{\n| \n<LI 
CLASS=||\n >\n}}}ltl4kyjnfvoq1tt9blah\" onmouseover=\"alert('hello world');\" 
align=\"left\"'''MOVE MOUSE CURSOR OVER HERE");
 add("selser", "Inline wiki vs wiki block nesting [2,0,1]", 
"y31p21pdfplgcik9\n\n'''Bold paragraph\n\nNew wiki paragraph");
 add("selser", "Inline wiki vs wiki block nesting [1,3,[3]]", "'''Bold 
paragraph\n");
 add("selser", "Inline wiki vs wiki block nesting [0,0,4]", "'''Bold 
paragraph\n\n1vaanv1vlmtb7qfr\n");

-- 
To view, visit https://gerrit.wikimedia.org/r/230270
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I8307b61bc784d28f7ce386152a500d6b78d36d87
Gerrit-PatchSet: 8
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Arlolra <[email protected]>
Gerrit-Reviewer: Arlolra <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: Tim Starling <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to