jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/391348 )
Change subject: Use Remex in Sanitizer::stripAllTags()
......................................................................
Use Remex in Sanitizer::stripAllTags()
Using a real HTML tokenizer fixes bugs when < or > appear in attribute
values. The old implementation used delimiterReplace(), which didn't
handle this case:
> print Sanitizer::stripAllTags( '<p data-foo="a<b>c">Hello</p>' );
c">Hello
We also can't use PHP's built-in strip_tags() because it doesn't handle
<?php and <? correctly:
> print strip_tags('1<span class="<?php">2</span>3');
1
> print strip_tags('1<span class="<?">2</span>3');
1
Bug: T179978
Change-Id: I53b98e6c877c00c03ff110914168b398559c9c3e
---
M autoload.php
A includes/parser/RemexStripTagHandler.php
M includes/parser/Sanitizer.php
M tests/phpunit/includes/parser/SanitizerTest.php
4 files changed, 57 insertions(+), 12 deletions(-)
Approvals:
jenkins-bot: Verified
Jforrester: Looks good to me, approved
diff --git a/autoload.php b/autoload.php
index 2f6fbda..3f5a3c1 100644
--- a/autoload.php
+++ b/autoload.php
@@ -1218,6 +1218,7 @@
'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php',
'RefreshLinksJob' => __DIR__ .
'/includes/jobqueue/jobs/RefreshLinksJob.php',
'RegexlikeReplacer' => __DIR__ .
'/includes/libs/replacers/RegexlikeReplacer.php',
+ 'RemexStripTagHandler' => __DIR__ .
'/includes/parser/RemexStripTagHandler.php',
'RemoveInvalidEmails' => __DIR__ .
'/maintenance/removeInvalidEmails.php',
'RemoveUnusedAccounts' => __DIR__ .
'/maintenance/removeUnusedAccounts.php',
'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php',
diff --git a/includes/parser/RemexStripTagHandler.php
b/includes/parser/RemexStripTagHandler.php
new file mode 100644
index 0000000..2839147
--- /dev/null
+++ b/includes/parser/RemexStripTagHandler.php
@@ -0,0 +1,40 @@
+<?php
+
+use RemexHtml\Tokenizer\Attributes;
+use RemexHtml\Tokenizer\TokenHandler;
+use RemexHtml\Tokenizer\Tokenizer;
+
+/**
+ * @internal
+ */
+class RemexStripTagHandler implements TokenHandler {
+ private $text = '';
+ public function getResult() {
+ return $this->text;
+ }
+
+ function startDocument( Tokenizer $t, $fns, $fn ) {
+ // Do nothing.
+ }
+ function endDocument( $pos ) {
+ // Do nothing.
+ }
+ function error( $text, $pos ) {
+ // Do nothing.
+ }
+ function characters( $text, $start, $length, $sourceStart,
$sourceLength ) {
+ $this->text .= substr( $text, $start, $length );
+ }
+ function startTag( $name, Attributes $attrs, $selfClose, $sourceStart,
$sourceLength ) {
+ // Do nothing.
+ }
+ function endTag( $name, $sourceStart, $sourceLength ) {
+ // Do nothing.
+ }
+ function doctype( $name, $public, $system, $quirks, $sourceStart,
$sourceLength ) {
+ // Do nothing.
+ }
+ function comment( $text, $sourceStart, $sourceLength ) {
+ // Do nothing.
+ }
+}
diff --git a/includes/parser/Sanitizer.php b/includes/parser/Sanitizer.php
index 4c99677..7c9f563 100644
--- a/includes/parser/Sanitizer.php
+++ b/includes/parser/Sanitizer.php
@@ -1967,17 +1967,22 @@
* Warning: this return value must be further escaped for literal
* inclusion in HTML output as of 1.10!
*
- * @param string $text HTML fragment
+ * @param string $html HTML fragment
* @return string
*/
- static function stripAllTags( $text ) {
- # Actual <tags>
- $text = StringUtils::delimiterReplace( '<', '>', '', $text );
+ static function stripAllTags( $html ) {
+ // Use RemexHtml to tokenize $html and extract the text
+ $handler = new RemexStripTagHandler;
+ $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler,
$html, [
+ 'ignoreErrors' => true,
+ // don't ignore char refs, we want them to be decoded
+ 'ignoreNulls' => true,
+ 'skipPreprocess' => true,
+ ] );
+ $tokenizer->execute();
+ $text = $handler->getResult();
- # Normalize &entities and whitespace
- $text = self::decodeCharReferences( $text );
$text = self::normalizeWhitespace( $text );
-
return $text;
}
diff --git a/tests/phpunit/includes/parser/SanitizerTest.php
b/tests/phpunit/includes/parser/SanitizerTest.php
index 269575b..d7e72e1 100644
--- a/tests/phpunit/includes/parser/SanitizerTest.php
+++ b/tests/phpunit/includes/parser/SanitizerTest.php
@@ -530,11 +530,10 @@
[ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
[ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
[ '<p>Hello <strong> world
café</p>', 'Hello <strong> world cafĂ©' ],
- // This one is broken, see T179978
- //[
- // '<p><small data-foo=\'bar"<baz>quux\'><a
href="./Foo">Bar</a></small> Whee!</p>',
- // 'Bar Whee!'
- //],
+ [
+ '<p><small data-foo=\'bar"<baz>quux\'><a
href="./Foo">Bar</a></small> Whee!</p>',
+ 'Bar Whee!'
+ ],
[ '1<span class="<?php">2</span>3', '123' ],
[ '1<span class="<?">2</span>3', '123' ],
];
--
To view, visit https://gerrit.wikimedia.org/r/391348
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I53b98e6c877c00c03ff110914168b398559c9c3e
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Catrope <[email protected]>
Gerrit-Reviewer: Catrope <[email protected]>
Gerrit-Reviewer: Jackmcbarn <[email protected]>
Gerrit-Reviewer: Jforrester <[email protected]>
Gerrit-Reviewer: TTO <[email protected]>
Gerrit-Reviewer: Tim Starling <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits