C. Scott Ananian has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/333997 )
Change subject: Protect language converter markup in the preprocessor (take 2). ...................................................................... Protect language converter markup in the preprocessor (take 2). This revises 28774022769d2273be16c6c6e1cca710a1fd97ef, which was reverted in master due to unexpected issues with `-{{...}} ` markup on translatewiki and enwiki. Test cases are added to ensure that this is parsed as a template, not as language converter markup. https://www.mediawiki.org/wiki/Preprocessor_ABNF is the canonical documentation for the preprocessor; this will be updated after this patch is merged. The basic principles described in that page are maintained in this patch: * Rightmost opening structure has precedence: `-{{` is parsed as a dash followed by template opening. * `{{{` has precedence over `{{` and `-{`: `-{{{{` is parsed as `-{` `{{{` since we first grab the rightmost `{{{`. A bunch of test cases were added to verify the "ideal precedence" order described on that wiki page. Bug: T153761 Change-Id: I2f0c186c75e392c95e1a3d89266cae2586349150 --- M includes/parser/Preprocessor.php M includes/parser/Preprocessor_DOM.php M includes/parser/Preprocessor_Hash.php M tests/parser/parserTests.txt 4 files changed, 245 insertions(+), 35 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/97/333997/1 diff --git a/includes/parser/Preprocessor.php b/includes/parser/Preprocessor.php index 426b550..cb8e3a7 100644 --- a/includes/parser/Preprocessor.php +++ b/includes/parser/Preprocessor.php @@ -51,9 +51,9 @@ ], '-{' => [ 'end' => '}-', - 'names' => [ 1 => null ], - 'min' => 1, - 'max' => 1, + 'names' => [ 2 => null ], + 'min' => 2, + 'max' => 2, ], ]; diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php index 661318b..3cdd38c 100644 --- a/includes/parser/Preprocessor_DOM.php +++ b/includes/parser/Preprocessor_DOM.php @@ -223,8 +223,7 @@ $searchBase = "[{<\n"; # } if ( !$wgDisableLangConversion ) { - // FIXME: disabled due to T153761 - // $searchBase .= '-'; + $searchBase .= '-'; } // For fast reverse searches @@ -277,6 +276,13 @@ $search = $searchBase; if ( $stack->top === false ) { $currentClosing = ''; + } else if ( + $stack->top->close === '}-' && + $stack->top->count > 2 + ) { + # adjust closing for -{{{...{{ + $currentClosing = '}'; + $search .= $currentClosing; } else { $currentClosing = $stack->top->close; $search .= $currentClosing; @@ -333,11 +339,15 @@ } elseif ( isset( $this->rules[$curChar] ) ) { $found = 'open'; $rule = $this->rules[$curChar]; - } elseif ( $curChar == '-' ) { - $found = 'dash'; } else { - # Some versions of PHP have a strcspn which stops on null characters - # Ignore and continue + # Some versions of PHP have a strcspn which stops on + # null characters; ignore these and continue. + # We also may get '-' and '}' characters here which + # don't match -{ or $currentClosing. Add these to + # output and continue. + if ( $curChar == '-' || $curChar == '}' ) { + $accum .= $curChar; + } ++$i; continue; } @@ -615,7 +625,10 @@ } elseif ( $found == 'open' ) { # count opening brace characters $curLen = strlen( $curChar ); - $count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i ); + $count = ( $curLen > 1 ) ? + # allow the final character to repeat + strspn( $text, $curChar[$curLen-1], $i+1 ) + 1 : + strspn( $text, $curChar, $i ); # we need to add to stack only if opening brace count is enough for one of the rules if ( $count >= $rule['min'] ) { @@ -635,17 +648,22 @@ # Add literal brace(s) $accum .= htmlspecialchars( str_repeat( $curChar, $count ) ); } - $i += $curLen * $count; + $i += $count; } elseif ( $found == 'close' ) { $piece = $stack->top; # lets check if there are enough characters for closing brace $maxCount = $piece->count; $curLen = strlen( $curChar ); - $count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i, $maxCount ); + $count = ( $curLen > 1 ) ? $curLen : + strspn( $text, $curChar, $i, $maxCount ); # check for maximum matching characters (if there are 5 closing # characters, we will probably need only 3 - depending on the rules) $rule = $this->rules[$piece->open]; + if ( $piece->close === '}-' && $piece->count > 2 ) { + # tweak for -{..{{ }}..}- + $rule = $this->rules['{']; + } if ( $count > $rule['max'] ) { # The specified maximum exists in the callback array, unless the caller # has made an error @@ -663,14 +681,16 @@ if ( $matchingCount <= 0 ) { # No matching element found in callback array # Output a literal closing brace and continue - $accum .= htmlspecialchars( str_repeat( $curChar, $count ) ); - $i += $curLen * $count; + $endText = substr( $text, $i, $count ); + $accum .= htmlspecialchars( $endText ); + $i += $count; continue; } $name = $rule['names'][$matchingCount]; if ( $name === null ) { // No element, just literal text - $element = $piece->breakSyntax( $matchingCount ) . str_repeat( $rule['end'], $matchingCount ); + $endText = substr( $text, $i, $matchingCount ); + $element = $piece->breakSyntax( $matchingCount ) . $endText; } else { # Create XML element # Note: $parts is already XML, does not need to be encoded further @@ -703,7 +723,7 @@ } # Advance input pointer - $i += $curLen * $matchingCount; + $i += $matchingCount; # Unwind the stack $stack->pop(); @@ -719,7 +739,12 @@ $stack->push( $piece ); $accum =& $stack->getAccum(); } else { - $accum .= str_repeat( $piece->open, $piece->count ); + $s = substr( $piece->open, 0, -1); + $s .= str_repeat( + substr( $piece->open, -1 ), + $piece->count - strlen($s) + ); + $accum .= $s; } } $flags = $stack->getFlags(); @@ -924,7 +949,11 @@ if ( $openingCount === false ) { $openingCount = $this->count; } - $s = str_repeat( $this->open, $openingCount ); + $s = substr( $this->open, 0, -1); + $s .= str_repeat( + substr( $this->open, -1 ), + $openingCount - strlen($s) + ); $first = true; foreach ( $this->parts as $part ) { if ( $first ) { diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php index 2666c93..c074b04 100644 --- a/includes/parser/Preprocessor_Hash.php +++ b/includes/parser/Preprocessor_Hash.php @@ -155,8 +155,7 @@ $searchBase = "[{<\n"; if ( !$wgDisableLangConversion ) { - // FIXME: disabled due to T153761 - // $searchBase .= '-'; + $searchBase .= '-'; } // For fast reverse searches @@ -208,6 +207,13 @@ $search = $searchBase; if ( $stack->top === false ) { $currentClosing = ''; + } else if ( + $stack->top->close === '}-' && + $stack->top->count > 2 + ) { + # adjust closing for -{{{...{{ + $currentClosing = '}'; + $search .= $currentClosing; } else { $currentClosing = $stack->top->close; $search .= $currentClosing; @@ -264,11 +270,15 @@ } elseif ( isset( $this->rules[$curChar] ) ) { $found = 'open'; $rule = $this->rules[$curChar]; - } elseif ( $curChar == '-' ) { - $found = 'dash'; } else { - # Some versions of PHP have a strcspn which stops on null characters - # Ignore and continue + # Some versions of PHP have a strcspn which stops on + # null characters; ignore these and continue. + # We also may get '-' and '}' characters here which + # don't match -{ or $currentClosing. Add these to + # output and continue. + if ( $curChar == '-' || $curChar == '}' ) { + self::addLiteral( $accum, $curChar ); + } ++$i; continue; } @@ -558,7 +568,10 @@ } elseif ( $found == 'open' ) { # count opening brace characters $curLen = strlen( $curChar ); - $count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i ); + $count = ( $curLen > 1 ) ? + # allow the final character to repeat + strspn( $text, $curChar[$curLen-1], $i+1 ) + 1 : + strspn( $text, $curChar, $i ); # we need to add to stack only if opening brace count is enough for one of the rules if ( $count >= $rule['min'] ) { @@ -577,17 +590,22 @@ # Add literal brace(s) self::addLiteral( $accum, str_repeat( $curChar, $count ) ); } - $i += $curLen * $count; + $i += $count; } elseif ( $found == 'close' ) { $piece = $stack->top; # lets check if there are enough characters for closing brace $maxCount = $piece->count; $curLen = strlen( $curChar ); - $count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i, $maxCount ); + $count = ( $curLen > 1 ) ? $curLen : + strspn( $text, $curChar, $i, $maxCount ); # check for maximum matching characters (if there are 5 closing # characters, we will probably need only 3 - depending on the rules) $rule = $this->rules[$piece->open]; + if ( $piece->close === '}-' && $piece->count > 2 ) { + # tweak for -{..{{ }}..}- + $rule = $this->rules['{']; + } if ( $count > $rule['max'] ) { # The specified maximum exists in the callback array, unless the caller # has made an error @@ -605,15 +623,17 @@ if ( $matchingCount <= 0 ) { # No matching element found in callback array # Output a literal closing brace and continue - self::addLiteral( $accum, str_repeat( $curChar, $count ) ); - $i += $curLen * $count; + $endText = substr( $text, $i, $count ); + self::addLiteral( $accum, $endText ); + $i += $count; continue; } $name = $rule['names'][$matchingCount]; if ( $name === null ) { // No element, just literal text + $endText = substr( $text, $i, $matchingCount ); $element = $piece->breakSyntax( $matchingCount ); - self::addLiteral( $element, str_repeat( $rule['end'], $matchingCount ) ); + self::addLiteral( $element, $endText ); } else { # Create XML element $parts = $piece->parts; @@ -648,7 +668,7 @@ } # Advance input pointer - $i += $curLen * $matchingCount; + $i += $matchingCount; # Unwind the stack $stack->pop(); @@ -664,7 +684,12 @@ $stack->push( $piece ); $accum =& $stack->getAccum(); } else { - self::addLiteral( $accum, str_repeat( $piece->open, $piece->count ) ); + $s = substr( $piece->open, 0, -1); + $s .= str_repeat( + substr( $piece->open, -1 ), + $piece->count - strlen($s) + ); + self::addLiteral( $accum, $s ); } } @@ -762,7 +787,12 @@ if ( $openingCount === false ) { $openingCount = $this->count; } - $accum = [ str_repeat( $this->open, $openingCount ) ]; + $s = substr( $this->open, 0, -1); + $s .= str_repeat( + substr( $this->open, -1 ), + $openingCount - strlen($s) + ); + $accum = [ $s ]; $lastIndex = 0; $first = true; foreach ( $this->parts as $part ) { diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index 42a1d1c..21fef66 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -11639,6 +11639,159 @@ !!end ### +### Preprocessor precedence tests +### See: https://www.mediawiki.org/wiki/Preprocessor_ABNF +### +##{{[[-{{{{{{[[Foo|bar}}]]}-}}}}}]] +!! test +Preprocessor precedence 1: link is rightmost opening +!! wikitext +{{[[Foo|bar}}]] +!! html +<p>{{<a href="/wiki/Foo" title="Foo">bar}}</a> +</p> +!! end + +!! test +Preprocessor precedence 2: template is rightmost opening +!! options +language=zh +!! wikitext +-{{echo|foo}-}}- +!! html +<p>-foo}-- +</p> +!! end + +!! test +Preprocessor precedence 3: language converter is rightmost opening +!! options +language=zh +!! wikitext +{{-{R|raw}}}- +!! html +<p>{{raw}} +</p> +!! end + +!! test +Preprocessor precedence 4: left-most angle bracket +!! options +language=zh +!! wikitext +<!--{raw}--> +!! html +!! end + +!! article +Template:Precedence5 +!! text +{{{{{1}}}}} +!! endarticle + +!! test +Preprocessor precedence 5: tplarg takes precedence over template +!! wikitext +{{Precedence5|Bullet}} +!! html +<ul><li> Bar</li></ul> + +!! end + +!! test +Preprocessor precedence 6: broken link is rightmost opening +!! wikitext +{{echo|[[Foo|bar|bat}} +!! html +<p>{{echo|[[Foo|bar|bat}} +</p> +!! end + +!! test +Preprocessor precedence 7: broken template is rightmost opening +!! wikitext +[[Foo{{echo|Bar]] +!! html +<p>[[Foo{{echo|Bar]] +</p> +!! end + +!! test +Preprocessor precedence 8: broken language converter is rightmost opening +!! options +language=zh +!! wikitext +[[Foo-{R|raw]] +!! html +<p>[[Foo-{R|raw]] +</p> +!! end + +!! article +Template:Preprocessor_precedence_9 +!! text +;4: {{{{1}}}} +;5: {{{{{2}}}}} +;6: {{{{{{3}}}}}} +;7: {{{{{{{4}}}}}}} +!! endarticle + +!! test +Preprocessor precedence 9: groups of braces +!! wikitext +{{Preprocessor precedence 9|Four|Bullet|1|2}} +!! html +<dl><dt>4</dt> +<dd> {Four}</dd> +<dt>5</dt> +<dd> </dd></dl> +<ul><li> Bar</li></ul> +<dl><dt>6</dt> +<dd> Four</dd> +<dt>7</dt> +<dd> {Bullet}</dd></dl> + +!! end + +!! article +Template:Preprocessor_precedence_10 +!! text +;1: -{R|raw}- +;2: -{{Bullet}}- +;3: -{{{1}}}- +;4: -{{{{2}}}}- +;5: -{{{{{3}}}}}- +;6: -{{{{{{4}}}}}}- +;7: -{{{{{{{5}}}}}}}- +!! endarticle + +!! test +Preprocessor precedence 10: groups of braces with leading dash +!! options +language=zh +!! wikitext +{{Preprocessor precedence 10|Three|raw2|Bullet|1|2}} +!! html +<dl><dt>1</dt> +<dd> raw</dd> +<dt>2</dt> +<dd> -</dd></dl> +<ul><li> Bar-</li></ul> +<dl><dt>3</dt> +<dd> -Three-</dd> +<dt>4</dt> +<dd> raw2</dd> +<dt>5</dt> +<dd> -</dd></dl> +<ul><li> Bar-</li></ul> +<dl><dt>6</dt> +<dd> -Three-</dd> +<dt>7</dt> +<dd> raw2</dd></dl> + +!! end + +### ### Token Stream Patcher tests ### ### These tests won't always pass wt2wt and other modes because @@ -20699,12 +20852,10 @@ </p> !! end -# FIXME: This test is currently broken in the PHP parser T153761 !! test T146304: Don't break template parsing if language converter markup is in the parameter. !! options language=sr variant=sr-ec -disabled !! wikitext {{echo|-{R|foo}-}} !! html/php -- To view, visit https://gerrit.wikimedia.org/r/333997 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I2f0c186c75e392c95e1a3d89266cae2586349150 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: C. Scott Ananian <canan...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits