jenkins-bot has submitted this change and it was merged.
Change subject: Improve broken attribute heuristics
......................................................................
Improve broken attribute heuristics
* Now that these rules are clearly marked for tables, we see where
breaking on ! and | makes sense.
* The blacklist change is from permitting | generic attribute names
which seems correct. In I941c595ef150666ea4535289d16e86c425c24389
however, the tags that we're permitting should be restricted to a
single line. Maybe an inline break on \n is called for.
Change-Id: I8307b61bc784d28f7ce386152a500d6b78d36d87
---
M lib/pegTokenizer.pegjs.txt
M tests/parserTests-blacklist.js
2 files changed, 30 insertions(+), 40 deletions(-)
Approvals:
Subramanya Sastry: Looks good to me, approved
jenkins-bot: Verified
Objections:
Arlolra: There's a problem with this change, please improve
diff --git a/lib/pegTokenizer.pegjs.txt b/lib/pegTokenizer.pegjs.txt
index 6d7626f..4c19304 100644
--- a/lib/pegTokenizer.pegjs.txt
+++ b/lib/pegTokenizer.pegjs.txt
@@ -1202,7 +1202,7 @@
}
could_be_attribute =
- // quick sanity check before expensive attribute_preprocessor_text_line
+ // quick sanity check before expensive table_attribute_preprocessor_text
// rule. Also try to parse on [|!+;] for now which seem to be common
// syntax errors in production that hidden by the PHP parser (by stripping
// the 'attributes').
@@ -1284,7 +1284,7 @@
// The arrangement of chars is to emphasize the split between what's disallowed
// by html5 and what's necessary to give directive a chance.
generic_attribute_name
- = r:( $[^ \t\0\n\r/=>"'!<&\[\]|{}\-]+
+ = r:( $[^ \t\0\n\r/=>"'<&\[\]{}\-]+
/ ! inline_breaks
! '/>'
// /=>"' is the html5 attribute name set we do not want.
@@ -1298,7 +1298,7 @@
// Same as generic_attribute_name, except for accepting tags found here.
// That doesn't make sense (ie. match php) in the generic case.
table_attribute_name
- = r:( $[^ \t\0\n\r/=>"'!<&\[\]|{}\-]+
+ = r:( $[^ \t\0\n\r/=>"'<&\[\]{}\-!|]+
/ ! inline_breaks
! '/>'
// /=>"' is the html5 attribute name set we do not want.
@@ -1332,8 +1332,8 @@
t1:attribute_preprocessor_text_single? "'" {
return tu.getAttributeValueAndSource(input, t1, startOffset(),
endOffset() - 1);
}
- // Missing end quote: accept | and > look-ahead as heuristic
- / t2:attribute_preprocessor_text_single_broken? &[|>] {
+ // Missing end quote: accept > look-ahead as heuristic
+ / t2:attribute_preprocessor_text_single_broken? &'>' {
return tu.getAttributeValueAndSource(input, t2, startOffset(),
endOffset());
}
) { return r; }
@@ -1341,36 +1341,36 @@
t1:attribute_preprocessor_text_double? '"' {
return tu.getAttributeValueAndSource(input, t1, startOffset(),
endOffset() - 1);
}
- // Missing end quote: accept | and > look-ahead as heuristic
- / t2:attribute_preprocessor_text_double_broken? &[|>] {
+ // Missing end quote: accept > look-ahead as heuristic
+ / t2:attribute_preprocessor_text_double_broken? &'>' {
return tu.getAttributeValueAndSource(input, t2, startOffset(),
endOffset());
}
) { return r; }
- / s:$space_or_newline* t:attribute_preprocessor_text !"=" {
+ / s:$space_or_newline* t:attribute_preprocessor_text !'=' {
return tu.getAttributeValueAndSource(input, t, startOffset() + s.length,
endOffset());
}
// Attribute value, restricted to a single line.
table_att_value
= space* "'" r:(
- t1:attribute_preprocessor_text_single_line? "'" {
+ t1:table_attribute_preprocessor_text_single? "'" {
return tu.getAttributeValueAndSource(input, t1, startOffset(),
endOffset() - 1);
}
- // Missing end quote: accept | and > look-ahead as heuristic
- / t2:attribute_preprocessor_text_single_line_broken? &[|>\n] {
+ // Missing end quote: accept |, !, and \n look-ahead as heuristic
+ / t2:table_attribute_preprocessor_text_single_broken? &[|!\n] {
return tu.getAttributeValueAndSource(input, t2, startOffset(),
endOffset());
}
) { return r; }
/ space* '"' r:(
- t1:attribute_preprocessor_text_double_line? '"' {
+ t1:table_attribute_preprocessor_text_double? '"' {
return tu.getAttributeValueAndSource(input, t1, startOffset(),
endOffset() - 1);
}
- // Missing end quote: accept | and > look-ahead as heuristic
- / t2:attribute_preprocessor_text_double_line_broken? &[|>\n] {
+ // Missing end quote: accept |, !, and \n look-ahead as heuristic
+ / t2:table_attribute_preprocessor_text_double_broken? &[|!\n] {
return tu.getAttributeValueAndSource(input, t2, startOffset(),
endOffset());
}
) { return r; }
- / s:$space* t:attribute_preprocessor_text_line !"=" {
+ / s:$space* t:table_attribute_preprocessor_text !'=' {
return tu.getAttributeValueAndSource(input, t, startOffset() + s.length,
endOffset());
}
@@ -2040,7 +2040,7 @@
}
attribute_preprocessor_text_single
- = r:( $[^{}&'<\-]+
+ = r:( $[^{}&<\-']+
/ !inline_breaks s:(
directive
/ [{}&<\-] ) { return s; }
@@ -2050,7 +2050,7 @@
}
attribute_preprocessor_text_single_broken
- = r:( $[^{}&'<>|\-]+
+ = r:( $[^{}&<\-'>]+
/ !inline_breaks s:(
directive
/ [{}&<\-] ) { return s; }
@@ -2060,7 +2060,7 @@
}
attribute_preprocessor_text_double
- = r:( $[^{}&"<\-]+
+ = r:( $[^{}&<\-"]+
/ !inline_breaks s:(
directive
/ [{}&<\-] ) { return s; }
@@ -2070,7 +2070,7 @@
}
attribute_preprocessor_text_double_broken
- = r:( $[^{}&"<>|\-]+
+ = r:( $[^{}&<\-">]+
/ !inline_breaks s:(
directive
/ [{}&<\-] ) { return s; }
@@ -2080,8 +2080,8 @@
}
// Variants with the entire attribute on a single line
-attribute_preprocessor_text_line
- = r:( $[^=<>\n\r&'"\t \[\]|{}/!\-]+
+table_attribute_preprocessor_text
+ = r:( $[^=<>\n\r&'"\t \[\]{}/\-|!]+
/ !inline_breaks
!'/>'
t:( directive
@@ -2089,29 +2089,29 @@
) { return t; }
)+ { return tu.flattenString(r); }
-attribute_preprocessor_text_single_line
- = r:( $[^{}&'<\n\-]+
+table_attribute_preprocessor_text_single
+ = r:( $[^{}&<\-'\n]+
/ !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
)* {
return tu.flattenString(r);
}
-attribute_preprocessor_text_single_line_broken
- = r:( $[^{}&'<>|!\n\-]+
+table_attribute_preprocessor_text_single_broken
+ = r:( $[^{}&<\-'\n|!]+
/ !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
)* {
return tu.flattenString(r);
}
-attribute_preprocessor_text_double_line
- = r:( $[^{}&"<\n\-]+
+table_attribute_preprocessor_text_double
+ = r:( $[^{}&<\-"\n]+
/ !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
)* {
return tu.flattenString(r);
}
-attribute_preprocessor_text_double_line_broken
- = r:( $[^{}&"<>|!\n\-]+
+table_attribute_preprocessor_text_double_broken
+ = r:( $[^{}&<\-"\n|!]+
/ !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
)* {
return tu.flattenString(r);
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 5802270..930a510 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -184,7 +184,7 @@
add("wt2html", "Fuzz testing: Parser14", "<h2
data-parsoid='{\"dsr\":[0,18,2,2]}'> onmouseover= </h2>\n<p
data-parsoid='{\"dsr\":[19,33,0,0]}'><a rel=\"mw:ExtLink\"
href=\"http://__TOC__\"
data-parsoid='{\"stx\":\"url\",\"dsr\":[19,33,0,0]}'>http://__TOC__</a></p>");
add("wt2html", "Fuzz testing: Parser14-table", "<h2
data-parsoid='{\"dsr\":[0,5,2,2]}'>a</h2>\n<table style=\"__TOC__\"
data-parsoid='{\"autoInsertedEnd\":true,\"dsr\":[6,22,16,0]}'></table>");
add("wt2html", "Fuzz testing: Parser24", "<p
data-parsoid='{\"fostered\":true,\"autoInsertedEnd\":true,\"dsr\":[0,0]}'><span
typeof=\"mw:Nowiki\"
data-parsoid='{\"src\":\"{{{\",\"dsr\":[3,6,0,0]}'>{{{</span>|\n<u class=\"|\"
about=\"#mwt2\" typeof=\"mw:ExpandedAttrs\"
data-parsoid='{\"stx\":\"html\",\"a\":{\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}\":null},\"sa\":{\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}\":\"\"},\"autoInsertedEnd\":true,\"dsr\":[8,0,74,0]}'
data-mw='{\"attribs\":[[{\"txt\":\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx-->}}}}\",\"html\":\"<span
about=\\\"#mwt1\\\" typeof=\\\"mw:Param\\\"
data-parsoid=\\\"{&quot;dsr&quot;:[20,79,null,null],&quot;src&quot;:&quot;{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}&quot;}\\\">{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--&gt;}}}</span>}\"},{\"html\":\"\"}]]}'>\n<br
style=\"onmouseover='alert(document.cookie);' \"
data-parsoid='{\"stx\":\"html\",\"selfClose\":true,\"dsr\":[83,0,53,0]}'/></u></p><p
data-parsoid='{\"fostered\":true,\"autoInsertedEnd\":true,\"dsr\":[0,0]}'><u
class=\"|\" about=\"#mwt2\" typeof=\"mw:ExpandedAttrs\"
data-parsoid='{\"stx\":\"html\",\"a\":{\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}\":null},\"sa\":{\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}\":\"\"},\"autoInsertedEnd\":true,\"autoInsertedStart\":true,\"dsr\":[-37,0,0,0]}'
data-mw='{\"attribs\":[[{\"txt\":\"{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx-->}}}}\",\"html\":\"<span
about=\\\"#mwt1\\\" typeof=\\\"mw:Param\\\"
data-parsoid=\\\"{&quot;dsr&quot;:[20,79,null,null],&quot;src&quot;:&quot;{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}&quot;}\\\">{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--&gt;}}}</span>}\"},{\"html\":\"\"}]]}'>MOVE
YOUR MOUSE CURSOR OVER THIS TEXT</u></p><table
data-parsoid='{\"autoInsertedEnd\":true,\"dsr\":[0,177,2,0]}'>\n\n\n\n<tbody
data-parsoid='{\"dsr\":[176,177,0,0]}'><tr
data-parsoid='{\"autoInsertedEnd\":true,\"autoInsertedStart\":true,\"dsr\":[176,177,0,0]}'><td
data-parsoid='{\"autoInsertedEnd\":true,\"dsr\":[176,177,1,0]}'></td></tr></tbody></table>");
-add("wt2html", "Fuzz testing: Parser25 (bug 6055)", "<span about=\"#mwt1\"
typeof=\"mw:Param\" data-parsoid='{\"dsr\":[0,26,null,null],\"src\":\"{{{\\n|
\\n<LI CLASS=||\\n >\\n}}}\"}'></span><p
data-parsoid='{\"dsr\":[26,110,0,0]}'>blah\" onmouseover=\"alert('hello
world');\" align=\"left\"<b
data-parsoid='{\"autoInsertedEnd\":true,\"dsr\":[80,110,3,0]}'>MOVE MOUSE
CURSOR OVER HERE</b></p>");
+add("wt2html", "Fuzz testing: Parser25 (bug 6055)", "<li class=\"\"
about=\"#mwt1\" typeof=\"mw:Param\"
data-parsoid='{\"stx\":\"html\",\"srcTagName\":\"LI\",\"a\":{\"||\":null},\"sa\":{\"||\":\"\"},\"autoInsertedEnd\":true,\"dsr\":[0,110,15,0],\"src\":\"{{{\\n|
\\n<LI CLASS=||\\n >\\n}}}blah\\\" onmouseover=\\\"alert('hello
world');\\\" align=\\\"left\\\"'''MOVE MOUSE CURSOR OVER
HERE\"}'>\n<p data-parsoid='{\"dsr\":[26,110,0,0]}'>blah\"
onmouseover=\"alert('hello world');\" align=\"left\"<b
data-parsoid='{\"autoInsertedEnd\":true,\"dsr\":[80,110,3,0]}'>MOVE MOUSE
CURSOR OVER HERE</b></p></li>");
add("wt2html", "Inline HTML vs wiki block nesting", "<p
data-parsoid='{\"dsr\":[0,17,0,0]}'><b
data-parsoid='{\"stx\":\"html\",\"autoInsertedEnd\":true,\"dsr\":[0,17,3,0]}'>Bold
paragraph</b></p><b
data-parsoid='{\"stx\":\"html\",\"autoInsertedEnd\":true,\"autoInsertedStart\":true,\"dsr\":[17,37,0,0]}'>\n\n<p
data-parsoid='{\"dsr\":[19,37,0,0]}'>New wiki paragraph</p></b>");
add("wt2html", "Special page transclusion", "<p about=\"#mwt1\"
typeof=\"mw:Transclusion\" data-parsoid='{\"dsr\":[0,30,0,0],\"pi\":[[]]}'
data-mw='{\"parts\":[{\"template\":{\"target\":{\"wt\":\"Special:Prefixindex/Xyzzyx\",\"function\":\"special\"},\"params\":{},\"i\":0}}]}'>Parser
function implementation for pf_special missing in Parsoid.</p>");
add("wt2html", "Special page transclusion twice (bug 5021)", "<p
data-parsoid='{\"dsr\":[0,61,0,0]}'><span about=\"#mwt1\"
typeof=\"mw:Transclusion\"
data-parsoid='{\"pi\":[[]],\"dsr\":[0,30,null,null]}'
data-mw='{\"parts\":[{\"template\":{\"target\":{\"wt\":\"Special:Prefixindex/Xyzzyx\",\"function\":\"special\"},\"params\":{},\"i\":0}}]}'>Parser
function implementation for pf_special missing in Parsoid.</span>\n<span
about=\"#mwt2\" typeof=\"mw:Transclusion\"
data-parsoid='{\"pi\":[[]],\"dsr\":[31,61,null,null]}'
data-mw='{\"parts\":[{\"template\":{\"target\":{\"wt\":\"Special:Prefixindex/Xyzzyx\",\"function\":\"special\"},\"params\":{},\"i\":0}}]}'>Parser
function implementation for pf_special missing in Parsoid.</span></p>");
@@ -397,7 +397,6 @@
add("wt2wt", "Fuzz testing: Parser21", "{|\n! irc://{{ftp://a\"
onmouseover=\"alert('hello world');\"\n|\n|}");
add("wt2wt", "Fuzz testing: Parser22", "http://===r:::https://b\n\n{|\n|}");
add("wt2wt", "Fuzz testing: Parser24", "{{{|\n<u class=\"|\"
{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}>\n<br
style=\"onmouseover='alert(document.cookie);' \" />\n\nMOVE YOUR MOUSE CURSOR
OVER THIS TEXT\n{|\n\n|\n|}");
-add("wt2wt", "Fuzz testing: Parser25 (bug 6055)", "{{{\n| \n<LI CLASS=||\n
>\n}}}blah\" onmouseover=\"alert('hello world');\" align=\"left\"'''MOVE MOUSE
CURSOR OVER HERE'''\n");
add("wt2wt", "Inline wiki vs wiki block nesting", "'''Bold paragraph'''\n\nNew
wiki paragraph\n");
add("wt2wt", "Mixing markup for italics and bold",
"'<nowiki/>''bold'<nowiki/>'''''bold''bolditalics'''''\n");
add("wt2wt", "ISBN code coverage", "ISBN 978-0-1234-56 789\n");
@@ -2157,15 +2156,6 @@
add("selser", "Fuzz testing: Parser24 [[1,2,0],0,4]",
"{{{sk3rl6utosgojemi|\n<u class=\"|\"
{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}>\n<br
style=\"onmouseover='alert(document.cookie);' \" />\n\n\n\njzny1t7i47mygb9\n");
add("selser", "Fuzz testing: Parser24 [[4,3,0],3,1]", "ajjrvymgwee45cdi<u
class=\"|\" {{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}>\n<br
style=\"onmouseover='alert(document.cookie);' \" />\n{|
data-foobar=\"q3u36zzq1lg3z0k9\"\n\n||}");
add("selser", "Fuzz testing: Parser24 [1,1,2]", "{{{|\n<u class=\"|\"
{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}>\n<br
style=\"onmouseover='alert(document.cookie);' \" />\n\nMOVE YOUR MOUSE CURSOR
OVER THIS TEXT\n\nwo1ljwyi3jlzbyb9\n{|\n{{{|\n<u CLASS=\n|
{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}} >\n<br
style=\"onmouseover='alert(document.cookie);' \" />\n\nMOVE YOUR MOUSE CURSOR
OVER THIS TEXT\n|");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,2]", "{{{\n| \n<LI
CLASS=||\n >\n}}}zstgbb6nw6nr8uxr\n\nblah\" onmouseover=\"alert('hello
world');\" align=\"left\"'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,1]", "{{{\n| \n<LI
CLASS=||\n >\n}}}blah\" onmouseover=\"alert('hello world');\"
align=\"left\"'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[2,2]]", "{{{\n| \n<LI
CLASS=||\n >\n}}}4rrl98ny5zgaxlxrblah\" onmouseover=\"alert('hello world');\"
align=\"left\"8i02syp6jkbj4i'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[3,0]]", "{{{\n| \n<LI
CLASS=||\n >\n}}}'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[0,2]]", "{{{\n| \n<LI
CLASS=||\n >\n}}}blah\" onmouseover=\"alert('hello world');\"
align=\"left\"1y1dswlzouu40a4i'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[3,2]]", "{{{\n| \n<LI
CLASS=||\n >\n}}}4kaxzxt0lciejyvi'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[4,2]]", "{{{\n| \n<LI
CLASS=||\n >\n}}}xfilnfj1id89qkt9ugf2zkqtlng3c8fr'''MOVE MOUSE CURSOR OVER
HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[4,0]]", "{{{\n| \n<LI
CLASS=||\n >\n}}}l2alwbk8ifab57b9'''MOVE MOUSE CURSOR OVER HERE");
-add("selser", "Fuzz testing: Parser25 (bug 6055) [0,[2,0]]", "{{{\n| \n<LI
CLASS=||\n >\n}}}ltl4kyjnfvoq1tt9blah\" onmouseover=\"alert('hello world');\"
align=\"left\"'''MOVE MOUSE CURSOR OVER HERE");
add("selser", "Inline wiki vs wiki block nesting [2,0,1]",
"y31p21pdfplgcik9\n\n'''Bold paragraph\n\nNew wiki paragraph");
add("selser", "Inline wiki vs wiki block nesting [1,3,[3]]", "'''Bold
paragraph\n");
add("selser", "Inline wiki vs wiki block nesting [0,0,4]", "'''Bold
paragraph\n\n1vaanv1vlmtb7qfr\n");
--
To view, visit https://gerrit.wikimedia.org/r/230270
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I8307b61bc784d28f7ce386152a500d6b78d36d87
Gerrit-PatchSet: 8
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Arlolra <[email protected]>
Gerrit-Reviewer: Arlolra <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: Tim Starling <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits