Arlolra has uploaded a new change for review.
https://gerrit.wikimedia.org/r/230270
Change subject: Improve broken attribute heuristics
......................................................................
Improve broken attribute heuristics
* Now that these rules are clearly marked for tables, we see where
breaking on ! and | makes sense.
Change-Id: I8307b61bc784d28f7ce386152a500d6b78d36d87
---
M lib/pegTokenizer.pegjs.txt
1 file changed, 29 insertions(+), 29 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid
refs/changes/70/230270/1
diff --git a/lib/pegTokenizer.pegjs.txt b/lib/pegTokenizer.pegjs.txt
index e9c6e88..b97b55f 100644
--- a/lib/pegTokenizer.pegjs.txt
+++ b/lib/pegTokenizer.pegjs.txt
@@ -1209,7 +1209,7 @@
}
could_be_attribute =
- // quick sanity check before expensive attribute_preprocessor_text_line
+ // quick sanity check before expensive table_attribute_preprocessor_text
// rule. Also try to parse on [|!+;] for now which seem to be common
// syntax errors in production that are hidden by the PHP parser (by stripping
// the 'attributes').
@@ -1291,7 +1291,7 @@
// The arrangement of chars is to emphasize the split between what's disallowed
// by html5 and what's necessary to give directive a chance.
generic_attribute_name
- = r:( $[^ \t\0\n\r/=>"'!<&\[\]|{}\-]+
+ = r:( $[^ \t\0\n\r/=>"'<&\[\]{}\-]+
/ ! inline_breaks
! '/>'
// /=>"' is the html5 attribute name set we do not want.
@@ -1305,7 +1305,7 @@
// Same as generic_attribute_name, except for eating tags found here, like php.
// That doesn't make sense (ie. match php) in the generic case.
table_attribute_name
- = r:( $[^ \t\0\n\r/=>"'!<&\[\]|{}\-]+
+ = r:( $[^ \t\0\n\r/=>"'<&\[\]{}\-!|]+
/ ! inline_breaks
! '/>'
// /=>"' is the html5 attribute name set we do not want.
@@ -1338,8 +1338,8 @@
t1:attribute_preprocessor_text_single? "'" {
return tu.getAttributeValueAndSource(input, t1, startOffset(),
endOffset() - 1);
}
- // Missing end quote: accept | and > look-ahead as heuristic
- / t2:attribute_preprocessor_text_single_broken? &[|>] {
+ // Missing end quote: accept > look-ahead as heuristic
+ / t2:attribute_preprocessor_text_single_broken? &'>' {
return tu.getAttributeValueAndSource(input, t2, startOffset(),
endOffset());
}
) { return r; }
@@ -1347,36 +1347,36 @@
t1:attribute_preprocessor_text_double? '"' {
return tu.getAttributeValueAndSource(input, t1, startOffset(),
endOffset() - 1);
}
- // Missing end quote: accept | and > look-ahead as heuristic
- / t2:attribute_preprocessor_text_double_broken? &[|>] {
+ // Missing end quote: accept > look-ahead as heuristic
+ / t2:attribute_preprocessor_text_double_broken? &'>' {
return tu.getAttributeValueAndSource(input, t2, startOffset(),
endOffset());
}
) { return r; }
- / s:$space_or_newline* t:attribute_preprocessor_text !"=" {
+ / s:$space_or_newline* t:attribute_preprocessor_text !'=' {
return tu.getAttributeValueAndSource(input, t, startOffset() + s.length,
endOffset());
}
// Attribute value, restricted to a single line.
table_att_value
= space* "'" r:(
- t1:attribute_preprocessor_text_single_line? "'" {
+ t1:table_attribute_preprocessor_text_single? "'" {
return tu.getAttributeValueAndSource(input, t1, startOffset(),
endOffset() - 1);
}
- // Missing end quote: accept | and > look-ahead as heuristic
- / t2:attribute_preprocessor_text_single_line_broken? &[|>\n] {
+ // Missing end quote: accept |, !, and \n look-ahead as heuristic
+ / t2:table_attribute_preprocessor_text_single_broken? &[|!\n] {
return tu.getAttributeValueAndSource(input, t2, startOffset(),
endOffset());
}
) { return r; }
/ space* '"' r:(
- t1:attribute_preprocessor_text_double_line? '"' {
+ t1:table_attribute_preprocessor_text_double? '"' {
return tu.getAttributeValueAndSource(input, t1, startOffset(),
endOffset() - 1);
}
- // Missing end quote: accept | and > look-ahead as heuristic
- / t2:attribute_preprocessor_text_double_line_broken? &[|>\n] {
+ // Missing end quote: accept |, !, and \n look-ahead as heuristic
+ / t2:table_attribute_preprocessor_text_double_broken? &[|!\n] {
return tu.getAttributeValueAndSource(input, t2, startOffset(),
endOffset());
}
) { return r; }
- / s:$space* t:attribute_preprocessor_text_line !"=" {
+ / s:$space* t:table_attribute_preprocessor_text !'=' {
return tu.getAttributeValueAndSource(input, t, startOffset() + s.length,
endOffset());
}
@@ -2069,7 +2069,7 @@
}
attribute_preprocessor_text_single
- = r:( $[^{}&'<\-]+
+ = r:( $[^{}&<\-']+
/ !inline_breaks s:(
directive
/ [{}&<\-] ) { return s; }
@@ -2079,7 +2079,7 @@
}
attribute_preprocessor_text_single_broken
- = r:( $[^{}&'<>|\-]+
+ = r:( $[^{}&<\-'>]+
/ !inline_breaks s:(
directive
/ [{}&<\-] ) { return s; }
@@ -2089,7 +2089,7 @@
}
attribute_preprocessor_text_double
- = r:( $[^{}&"<\-]+
+ = r:( $[^{}&<\-"]+
/ !inline_breaks s:(
directive
/ [{}&<\-] ) { return s; }
@@ -2099,7 +2099,7 @@
}
attribute_preprocessor_text_double_broken
- = r:( $[^{}&"<>|\-]+
+ = r:( $[^{}&<\-">]+
/ !inline_breaks s:(
directive
/ [{}&<\-] ) { return s; }
@@ -2109,8 +2109,8 @@
}
// Variants with the entire attribute on a single line
-attribute_preprocessor_text_line
- = r:( $[^=<>\n\r&'"\t \[\]|{}/!\-]+
+table_attribute_preprocessor_text
+ = r:( $[^=<>\n\r&'"\t \[\]{}/\-|!]+
/ !inline_breaks
!'/>'
t:( directive
@@ -2118,29 +2118,29 @@
) { return t; }
)+ { return tu.flattenString(r); }
-attribute_preprocessor_text_single_line
- = r:( $[^{}&'<\n\-]+
+table_attribute_preprocessor_text_single
+ = r:( $[^{}&<\-'\n]+
/ !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
)* {
return tu.flattenString(r);
}
-attribute_preprocessor_text_single_line_broken
- = r:( $[^{}&'<>|!\n\-]+
+table_attribute_preprocessor_text_single_broken
+ = r:( $[^{}&<\-'\n|!]+
/ !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
)* {
return tu.flattenString(r);
}
-attribute_preprocessor_text_double_line
- = r:( $[^{}&"<\n\-]+
+table_attribute_preprocessor_text_double
+ = r:( $[^{}&<\-"\n]+
/ !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
)* {
return tu.flattenString(r);
}
-attribute_preprocessor_text_double_line_broken
- = r:( $[^{}&"<>|!\n\-]+
+table_attribute_preprocessor_text_double_broken
+ = r:( $[^{}&<\-"\n|!]+
/ !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
)* {
return tu.flattenString(r);
--
To view, visit https://gerrit.wikimedia.org/r/230270
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I8307b61bc784d28f7ce386152a500d6b78d36d87
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Arlolra <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits