Arlolra has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/230270

Change subject: Improve broken attribute heuristics
......................................................................

Improve broken attribute heuristics

 * Now that these rules are clearly marked for tables, we see where
   breaking on ! and | makes sense.

Change-Id: I8307b61bc784d28f7ce386152a500d6b78d36d87
---
M lib/pegTokenizer.pegjs.txt
1 file changed, 29 insertions(+), 29 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/70/230270/1

diff --git a/lib/pegTokenizer.pegjs.txt b/lib/pegTokenizer.pegjs.txt
index e9c6e88..b97b55f 100644
--- a/lib/pegTokenizer.pegjs.txt
+++ b/lib/pegTokenizer.pegjs.txt
@@ -1209,7 +1209,7 @@
     }
 
 could_be_attribute =
-    // quick sanity check before expensive attribute_preprocessor_text_line
+    // quick sanity check before expensive table_attribute_preprocessor_text
     // rule. Also try to parse on [|!+;] for now which seem to be common
     // syntax errors in production that hidden by the PHP parser (by stripping
     // the 'attributes').
@@ -1291,7 +1291,7 @@
 // The arrangement of chars is to emphasize the split between what's disallowed
 // by html5 and what's necessary to give directive a chance.
 generic_attribute_name
-  = r:( $[^ \t\0\n\r/=>"'!<&\[\]|{}\-]+
+  = r:( $[^ \t\0\n\r/=>"'<&\[\]{}\-]+
         / ! inline_breaks
           ! '/>'
           // /=>"' is the html5 attribute name set we do not want.
@@ -1305,7 +1305,7 @@
 // Same as generic_attribute_name, except for eating tags found here, like php.
 // That doesn't make sense (ie. match php) in the generic case.
 table_attribute_name
-  = r:( $[^ \t\0\n\r/=>"'!<&\[\]|{}\-]+
+  = r:( $[^ \t\0\n\r/=>"'<&\[\]{}\-!|]+
         / ! inline_breaks
           ! '/>'
           // /=>"' is the html5 attribute name set we do not want.
@@ -1338,8 +1338,8 @@
       t1:attribute_preprocessor_text_single? "'" {
         return tu.getAttributeValueAndSource(input, t1, startOffset(), 
endOffset() - 1);
       }
-      // Missing end quote: accept | and > look-ahead as heuristic
-      / t2:attribute_preprocessor_text_single_broken? &[|>] {
+      // Missing end quote: accept > look-ahead as heuristic
+      / t2:attribute_preprocessor_text_single_broken? &'>' {
         return tu.getAttributeValueAndSource(input, t2, startOffset(), 
endOffset());
       }
     ) { return r; }
@@ -1347,36 +1347,36 @@
       t1:attribute_preprocessor_text_double? '"' {
         return tu.getAttributeValueAndSource(input, t1, startOffset(), 
endOffset() - 1);
       }
-      // Missing end quote: accept | and > look-ahead as heuristic
-      / t2:attribute_preprocessor_text_double_broken? &[|>] {
+      // Missing end quote: accept > look-ahead as heuristic
+      / t2:attribute_preprocessor_text_double_broken? &'>' {
         return tu.getAttributeValueAndSource(input, t2, startOffset(), 
endOffset());
       }
     ) { return r; }
-  / s:$space_or_newline* t:attribute_preprocessor_text !"=" {
+  / s:$space_or_newline* t:attribute_preprocessor_text !'=' {
       return tu.getAttributeValueAndSource(input, t, startOffset() + s.length, 
endOffset());
     }
 
 // Attribute value, restricted to a single line.
 table_att_value
   = space* "'" r:(
-      t1:attribute_preprocessor_text_single_line?  "'" {
+      t1:table_attribute_preprocessor_text_single?  "'" {
         return tu.getAttributeValueAndSource(input, t1, startOffset(), 
endOffset() - 1);
       }
-      // Missing end quote: accept | and > look-ahead as heuristic
-      / t2:attribute_preprocessor_text_single_line_broken? &[|>\n] {
+      // Missing end quote: accept |, !, and \n look-ahead as heuristic
+      / t2:table_attribute_preprocessor_text_single_broken? &[|!\n] {
         return tu.getAttributeValueAndSource(input, t2, startOffset(), 
endOffset());
       }
     ) { return r; }
   / space* '"' r:(
-      t1:attribute_preprocessor_text_double_line? '"' {
+      t1:table_attribute_preprocessor_text_double? '"' {
         return tu.getAttributeValueAndSource(input, t1, startOffset(), 
endOffset() - 1);
       }
-      // Missing end quote: accept | and > look-ahead as heuristic
-      / t2:attribute_preprocessor_text_double_line_broken? &[|>\n] {
+      // Missing end quote: accept |, !, and \n look-ahead as heuristic
+      / t2:table_attribute_preprocessor_text_double_broken? &[|!\n] {
         return tu.getAttributeValueAndSource(input, t2, startOffset(), 
endOffset());
       }
     ) { return r; }
-  / s:$space* t:attribute_preprocessor_text_line !"=" {
+  / s:$space* t:table_attribute_preprocessor_text !'=' {
       return tu.getAttributeValueAndSource(input, t, startOffset() + s.length, 
endOffset());
     }
 
@@ -2069,7 +2069,7 @@
   }
 
 attribute_preprocessor_text_single
-  = r:( $[^{}&'<\-]+
+  = r:( $[^{}&<\-']+
   / !inline_breaks s:(
       directive
     / [{}&<\-] ) { return s; }
@@ -2079,7 +2079,7 @@
   }
 
 attribute_preprocessor_text_single_broken
-  = r:( $[^{}&'<>|\-]+
+  = r:( $[^{}&<\-'>]+
   / !inline_breaks s:(
       directive
     / [{}&<\-] ) { return s; }
@@ -2089,7 +2089,7 @@
   }
 
 attribute_preprocessor_text_double
-  = r:( $[^{}&"<\-]+
+  = r:( $[^{}&<\-"]+
   / !inline_breaks s:(
       directive
     / [{}&<\-] ) { return s; }
@@ -2099,7 +2099,7 @@
   }
 
 attribute_preprocessor_text_double_broken
-  = r:( $[^{}&"<>|\-]+
+  = r:( $[^{}&<\-">]+
   / !inline_breaks s:(
       directive
     / [{}&<\-] ) { return s; }
@@ -2109,8 +2109,8 @@
   }
 
 // Variants with the entire attribute on a single line
-attribute_preprocessor_text_line
-  = r:( $[^=<>\n\r&'"\t \[\]|{}/!\-]+
+table_attribute_preprocessor_text
+  = r:( $[^=<>\n\r&'"\t \[\]{}/\-|!]+
         / !inline_breaks
           !'/>'
           t:( directive
@@ -2118,29 +2118,29 @@
           ) { return t; }
     )+ { return tu.flattenString(r); }
 
-attribute_preprocessor_text_single_line
-  = r:( $[^{}&'<\n\-]+
+table_attribute_preprocessor_text_single
+  = r:( $[^{}&<\-'\n]+
   / !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
   )* {
       return tu.flattenString(r);
   }
 
-attribute_preprocessor_text_single_line_broken
-  = r:( $[^{}&'<>|!\n\-]+
+table_attribute_preprocessor_text_single_broken
+  = r:( $[^{}&<\-'\n|!]+
   / !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
   )* {
       return tu.flattenString(r);
   }
 
-attribute_preprocessor_text_double_line
-  = r:( $[^{}&"<\n\-]+
+table_attribute_preprocessor_text_double
+  = r:( $[^{}&<\-"\n]+
   / !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
   )* {
       return tu.flattenString(r);
   }
 
-attribute_preprocessor_text_double_line_broken
-  = r:( $[^{}&"<>|!\n\-]+
+table_attribute_preprocessor_text_double_broken
+  = r:( $[^{}&<\-"\n|!]+
   / !inline_breaks s:( directive / $[{}&<\-] ) { return s; }
   )* {
       return tu.flattenString(r);

-- 
To view, visit https://gerrit.wikimedia.org/r/230270
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I8307b61bc784d28f7ce386152a500d6b78d36d87
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Arlolra <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to