[MediaWiki-commits] [Gerrit] mediawiki...parsoid[master]: Preprocessor precedence: rightmost-opening/"broken-link"/"br...

2017-05-16 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/350867 )

Change subject: Preprocessor precedence: 
rightmost-opening/"broken-link"/"broken-template"
..


Preprocessor precedence: rightmost-opening/"broken-link"/"broken-template"

Match the behavior of the PHP preprocessor for links or templates missing
their close brackets or misnested, by using a shared stack for brackets in
inlineBreaks.  We only stop for the close-symbol of the rightmost opened
construct; see [[:mw:Preprocessor_ABNF]] for details.  So:
  [[Foo|bar}}]]
and
  [[Foo}}|bar]]
are both valid, even if there's an template opening somewhere to the left.
Note however that the latter example will not become a link in the output
because }} is not a valid title character -- it is unlinked at a later
stage.  Similarly:
  [[Foo|{{echo|]]}}
is a broken link, because the square brackets are not inline breaks inside
the template.

Note that there is a deliberate incompatiblity with PHP.  Because PHP has
a separate preprocessor stage that recognizes *but does not mark*
wikilinks, markup like:
  [[Foo{{echo|]]}}
or
  [[Foo{{echo|]]
is parsed as a broken wikilink in the PHP preprocessor, but it then emits
an intermediate result (`[[Foo]]` or `[[Foo{{echo|]]`) which is then
later parsed as a wikilink by the PHP parser.  This is arguably a bug,
which could be fixed by having the PHP preprocessor either explicitly
mark wikilinks, or else HTML-escape `[[` when it appears in the
`broken-wikilink` production. A rough sketch of a PHP-side fix can be
found in I0dd4512dc9014fc8e329e682be20184867cafe80.

Change-Id: I8a2dd4471383b389dc17fefb7917305aefcb3141
---
M lib/wt2html/pegTokenizer.pegjs
M lib/wt2html/tokenizer.utils.js
M tests/parserTests-blacklist.js
M tests/parserTests.txt
4 files changed, 166 insertions(+), 48 deletions(-)

Approvals:
  jenkins-bot: Verified
  Arlolra: Looks good to me, approved



diff --git a/lib/wt2html/pegTokenizer.pegjs b/lib/wt2html/pegTokenizer.pegjs
index 5e09a0c..2664c17 100644
--- a/lib/wt2html/pegTokenizer.pegjs
+++ b/lib/wt2html/pegTokenizer.pegjs
@@ -780,23 +780,29 @@
  * 5: {·} → {{·{{{·}}}·}}
  * 6: {{·}} → {{{·{{{·}}}·}}}
  * 7: {{{·}}} → {·{{{·{{{·}}}·}}}·}
+ * This is only if close has > 3 braces; otherwise we just match open
+ * and close as we find them.
  */
 tplarg_or_template
-= & {
+  = &'{{' &{
   // Refuse to recurse beyond 40 levels. Default in the PHP parser
   // is $wgMaxTemplateDepth = 40; This is to prevent crashing from
   // buggy wikitext with lots of unclosed template calls, as in
   // eswiki/Usuario:C%C3%A1rdenas/PRUEBAS?oldid=651094
   if (stops.onCount('templatedepth') === undefined ||
   stops.onCount('templatedepth') < 40) {
-return stops.inc('templatedepth');
+return true;
   } else {
 return false;
   }
-}
-r:( &('{{' &('{{{'+ !'{') tplarg) a:template { return a; }
+} t:tplarg_or_template_guarded { return t; }
+
+tplarg_or_template_guarded
+  = &{ return stops.inc('templatedepth'); }
+r:( &('{{' &('{{{'+ !'{') tplarg) a:(template/broken_template) { return a; 
}
   / a:$('{' &('{{{'+ !'{'))? b:tplarg { return [a].concat(b); }
   / a:$('{' &('{{' !'{'))? b:template { return [a].concat(b); }
+  / a:broken_template { return a; }
 ) {
   stops.dec('templatedepth');
   return r;
@@ -807,6 +813,48 @@
 = r:(tplarg_or_template / .)+ { return tu.flattenIfArray(r); }
 
 template
+  = stopLen:("" { return stops.push('preproc', /* {{ */'}}'); })
+t:( template_preproc / &{ return stops.popTo('preproc', stopLen); } )
+{ stops.popTo('preproc', stopLen); return t; }
+
+// The PHP preprocessor maintains a single stack of "closing token we
+// are currently looking for", with no backtracking.  This means that
+// once you see `[[ {{` you are looking only for `}}` -- if that template
+// turns out to be broken you will never pop the `}}` and there is no way
+// to close the `[[`.  Since the PEG tokenizer in Parsoid uses backtracking
+// and parses in a single pass (instead of PHP's split preprocessor/parser)
+// we have to be a little more careful when we emulate this behavior.
+// If we use a rule like:
+//   template = "{{" tplname tplargs* "}}"?
+// Then we end up having to reinterpret `tplname tplargs*` as a tlb if it
+// turns out we never find the `}}`, which involves a lot of tedious gluing
+// tokens back together with fingers crossed we haven't discarded any
+// significant newlines/whitespace/etc.  An alternative would be a rule like:
+//   broken_template = "{{" tlb
+// but again, `template` is used in many different contexts; `tlb` isn't
+// necessarily the right one to recursively invoke.  Instead we get the
+// broken template off of the PEGjs production stack by returning immediately
+// after `{{`, but we leave a "broken token" on top of the preprocessor

[MediaWiki-commits] [Gerrit] mediawiki...parsoid[master]: Preprocessor precedence: rightmost-opening/"broken-link"/"br...

2017-04-28 Thread C. Scott Ananian (Code Review)
C. Scott Ananian has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/350867 )

Change subject: Preprocessor precedence: 
rightmost-opening/"broken-link"/"broken-template"
..

Preprocessor precedence: rightmost-opening/"broken-link"/"broken-template"

Match the behavior of the PHP preprocessor for links or templates missing
their close brackets or misnested, by using a shared stack for brackets in
inlineBreaks.  We only stop for the close-symbol of the rightmost opened
construct; see [[:mw:Preprocessor_ABNF]] for details.  So:
  [[Foo|bar}}]]
and
  [[Foo}}|bar]]
are both valid, even if there's an template opening somewhere to the left.
Note however that the latter example will not become a link in the output
because }} is not a valid title character -- it is unlinked at a later
stage.  Similarly:
  [[Foo|{{echo|]]}}
is a broken link, because the square brackets are not inline breaks inside
the template.

Change-Id: I8a2dd4471383b389dc17fefb7917305aefcb3141
---
M lib/wt2html/pegTokenizer.pegjs
M lib/wt2html/tokenizer.utils.js
M tests/parserTests-blacklist.js
M tests/parserTests.txt
4 files changed, 149 insertions(+), 46 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid 
refs/changes/67/350867/1

diff --git a/lib/wt2html/pegTokenizer.pegjs b/lib/wt2html/pegTokenizer.pegjs
index 473f70b..3ebf28d 100644
--- a/lib/wt2html/pegTokenizer.pegjs
+++ b/lib/wt2html/pegTokenizer.pegjs
@@ -780,33 +780,51 @@
  * 5: {·} → {{·{{{·}}}·}}
  * 6: {{·}} → {{{·{{{·}}}·}}}
  * 7: {{{·}}} → {·{{{·{{{·}}}·}}}·}
+ * This is only if close has > 3 braces; otherwise we just match open
+ * and close as we find them.
  */
 tplarg_or_template
-= & {
+  = &'{{' &{
   // Refuse to recurse beyond 40 levels. Default in the PHP parser
   // is $wgMaxTemplateDepth = 40; This is to prevent crashing from
   // buggy wikitext with lots of unclosed template calls, as in
   // eswiki/Usuario:C%C3%A1rdenas/PRUEBAS?oldid=651094
   if (stops.onCount('templatedepth') === undefined ||
   stops.onCount('templatedepth') < 40) {
-return stops.inc('templatedepth');
+return true;
   } else {
 return false;
   }
-}
-r:( &('{{' &('{{{'+ !'{') tplarg) a:template { return a; }
+} t:tplarg_or_template_guarded { return t; }
+
+tplarg_or_template_guarded
+  = &{ return stops.inc('templatedepth'); }
+r:( &('{{' &('{{{'+ !'{') tplarg) a:(template/broken_template) { return a; 
}
   / a:$('{' &('{{{'+ !'{'))? b:tplarg { return [a].concat(b); }
   / a:$('{' &('{{' !'{'))? b:template { return [a].concat(b); }
 ) {
   stops.dec('templatedepth');
   return r;
 }
+// deliberately fail to decrement templatedepth for a broken template
+/ a:broken_template { return a; }
 / & { return stops.dec('templatedepth'); }
 
 tplarg_or_template_or_bust "tplarg_or_template_or_bust"
 = r:(tplarg_or_template / .)+ { return tu.flattenIfArray(r); }
 
 template
+  = &{ return stops.push("preproc", /*{{*/"}}"); }
+t:template_preproc
+{ stops.pop("preproc"); return t; }
+  / &{ return stops.pop("preproc"); }
+
+broken_template
+  = &{ return stops.push("preproc", "broken"); }
+// for broken-template,  deliberately fail to pop the preproc stops stack
+t:"{{" { return t; }
+
+template_preproc
   = "{{" nl_comment_space*
 target:template_param_value
 params:(nl_comment_space* "|"
@@ -829,6 +847,12 @@
 } / $('{{' space_or_newline+ '}}')
 
 tplarg
+  = &{ return stops.push("preproc", /*{{{*/"}}}"); }
+t:tplarg_preproc
+{ stops.pop("preproc"); return t; }
+  / &{ return stops.pop("preproc"); }
+
+tplarg_preproc
   = "{{{"
 p:("" { return endOffset(); })
 target:template_param_value?
@@ -945,8 +969,18 @@
 }
 }
 
-// TODO: handle link prefixes as in al[[Razi]]
 wikilink
+  = &{ return stops.push("preproc", "]]"); }
+w:wikilink_preproc
+{ stops.pop("preproc"); return w; }
+  // `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the
+  // second bracket could start an extlink.  Deliberately leave entry
+  // on preproc stack since we haven't seen a double-close bracket.
+  / &"[[" &{ return stops.replace("preproc", "broken"); }
+a:"[" b:(extlink / "[") { return [a,b]; }
+  / &{ return stops.pop("preproc"); }
+
+wikilink_preproc
   = "[["
 ! url
 //target:link_target
@@ -960,12 +994,25 @@
   lcontent = { content: [] };
   }
 
+  var textTokens = [];
   if (target === null || lcontent.pipetrick) {
-return [text()];
+textTokens.push("[[");
+if (target) {
+  textTokens.push(target);
+}
+if (lcontent.pipetrick) {
+  textTokens.push("|");
+}
+lcontent.content.forEach(function(a) {
+  // a is a mw:maybeContent attribute
+  textTokens.push("|");
+