Catrope has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/391348 )

Change subject: [WIP] Use Remex in Sanitizer::stripAllTags()
......................................................................

[WIP] Use Remex in Sanitizer::stripAllTags()

Using a real HTML tokenizer fixes bugs when < or > appear in attribute
values.

Bug: T179978
Change-Id: I53b98e6c877c00c03ff110914168b398559c9c3e
---
M autoload.php
M includes/Sanitizer.php
M includes/tidy/RemexDriver.php
A includes/tidy/RemexStripTagHandler.php
M tests/phpunit/includes/SanitizerTest.php
5 files changed, 43 insertions(+), 7 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/48/391348/1

diff --git a/autoload.php b/autoload.php
index 8053f5e..caf8863 100644
--- a/autoload.php
+++ b/autoload.php
@@ -955,6 +955,7 @@
        'MediaWiki\\Tidy\\RemexCompatMunger' => __DIR__ . 
'/includes/tidy/RemexCompatMunger.php',
        'MediaWiki\\Tidy\\RemexDriver' => __DIR__ . 
'/includes/tidy/RemexDriver.php',
        'MediaWiki\\Tidy\\RemexMungerData' => __DIR__ . 
'/includes/tidy/RemexMungerData.php',
+       'MediaWiki\\Tidy\\RemexStripTagHandler' => __DIR__ . 
'/includes/tidy/RemexStripTagHandler.php',
        'MediaWiki\\Tidy\\TidyDriverBase' => __DIR__ . 
'/includes/tidy/TidyDriverBase.php',
        'MediaWiki\\Widget\\ComplexNamespaceInputWidget' => __DIR__ . 
'/includes/widget/ComplexNamespaceInputWidget.php',
        'MediaWiki\\Widget\\ComplexTitleInputWidget' => __DIR__ . 
'/includes/widget/ComplexTitleInputWidget.php',
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 4c99677..6ad2c46 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -1971,11 +1971,7 @@
         * @return string
         */
        static function stripAllTags( $text ) {
-               # Actual <tags>
-               $text = StringUtils::delimiterReplace( '<', '>', '', $text );
-
-               # Normalize &entities and whitespace
-               $text = self::decodeCharReferences( $text );
+               $text = MediaWiki\Tidy\RemexDriver::stripAllTags( $text );
                $text = self::normalizeWhitespace( $text );
 
                return $text;
diff --git a/includes/tidy/RemexDriver.php b/includes/tidy/RemexDriver.php
index e02af88..93f7f29 100644
--- a/includes/tidy/RemexDriver.php
+++ b/includes/tidy/RemexDriver.php
@@ -54,4 +54,16 @@
                ] );
                return $serializer->getResult();
        }
+
+       public static function stripAllTags( $html ) {
+               $handler = new RemexStripTagHandler;
+               $tokenizer = new Tokenizer( $handler, $html, [
+                       'ignoreErrors' => true,
+                       // don't ignore char refs, we want them to be decoded
+                       'ignoreNulls' => true,
+                       'skipPreprocess' => true,
+               ] );
+               $tokenizer->execute();
+               return $handler->getResult();
+       }
 }
diff --git a/includes/tidy/RemexStripTagHandler.php b/includes/tidy/RemexStripTagHandler.php
new file mode 100644
index 0000000..9ebdd62
--- /dev/null
+++ b/includes/tidy/RemexStripTagHandler.php
@@ -0,0 +1,28 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+use RemexHtml\Tokenizer\Attributes;
+use RemexHtml\Tokenizer\TokenHandler;
+use RemexHtml\Tokenizer\Tokenizer;
+
+/**
+ * @internal
+ */
+class RemexStripTagHandler implements TokenHandler {
+       private $text = '';
+       public function getResult() {
+               return $this->text;
+       }
+
+       function startDocument( Tokenizer $t, $fns, $fn ) {}
+       function endDocument( $pos ) {}
+       function error( $text, $pos ) {}
+       function characters( $text, $start, $length, $sourceStart, 
$sourceLength ) {
+               $this->text .= substr( $text, $start, $length );
+       }
+       function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, 
$sourceLength ) {}
+       function endTag( $name, $sourceStart, $sourceLength ) {}
+       function doctype( $name, $public, $system, $quirks, $sourceStart, 
$sourceLength ) {}
+       function comment( $text, $sourceStart, $sourceLength ) {}
+}
diff --git a/tests/phpunit/includes/SanitizerTest.php b/tests/phpunit/includes/SanitizerTest.php
index 4a33125..33da650 100644
--- a/tests/phpunit/includes/SanitizerTest.php
+++ b/tests/phpunit/includes/SanitizerTest.php
@@ -530,8 +530,7 @@
                        [ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
                        [ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
                        [ '<p>Hello &lt;strong&gt; wor&#x6c;&#100; 
caf&eacute;</p>', 'Hello <strong> world café' ],
-                       // This one is broken, see T179978
-                       //[ '<p><small data-foo=\'bar"&lt;baz>quux\'><a 
href="./Foo">Bar</a></small> Whee!</p>', 'Bar Whee!' ],
+                       [ '<p><small data-foo=\'bar"&lt;baz>quux\'><a 
href="./Foo">Bar</a></small> Whee!</p>', 'Bar Whee!' ],
                ];
        }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/391348
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I53b98e6c877c00c03ff110914168b398559c9c3e
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Catrope <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to