C. Scott Ananian has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/340225 )

Change subject: T159174: Strip U+0000 in wikitext
......................................................................

T159174: Strip U+0000 in wikitext

U+0000 is not allowed in HTML5, there's no reason to allow it in wikitext.

It simplifies our code if we can just strip them at the start.  Strip in
PST as well so they don't sneak into our database either.

Tweaked the EXT_LINK URLs to account for the fact that invalid characters
get transformed into U+FFFD when using Preprocessor_DOM.  See
https://github.com/wikimedia/mediawiki/commit/73649741ed1e2f557aec22a485598b199fdd2d09
for context on that change.

Change-Id: I3f67e92b61aacc87a40c3662085c84d1dac08bfb
---
M includes/parser/Parser.php
M languages/LanguageConverter.php
M tests/parser/extraParserTests.txt
3 files changed, 12 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/25/340225/1

diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index 86aa06a..5f64c92 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -89,13 +89,15 @@
        # Everything except bracket, space, or control characters
        # \p{Zs} is unicode 'separator, space' category. It covers the space 
0x20
        # as well as U+3000 is IDEOGRAPHIC SPACE for T21052
-       const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]';
+       # \x{FFFD} is the Unicode replacement character, which Preprocessor_DOM
+       # uses to replace invalid HTML characters.
+       const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]';
        # Simplified expression to match an IPv4 or IPv6 address, or
        # at least one character of a host name (embeds EXT_LINK_URL_CLASS)
-       const EXT_LINK_ADDR = 
'(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}])';
+       const EXT_LINK_ADDR = 
'(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}])';
        # RegExp to make image URLs (embeds IPv6 part of EXT_LINK_ADDR)
        // @codingStandardsIgnoreStart Generic.Files.LineLength
-       const EXT_IMAGE_REGEX = 
'/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}]+)
+       const EXT_IMAGE_REGEX = 
'/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]+)
                
\\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu';
        // @codingStandardsIgnoreEnd
 
@@ -264,7 +266,7 @@
                $this->mUrlProtocols = wfUrlProtocols();
                $this->mExtLinkBracketedRegex = '/\[(((?i)' . 
$this->mUrlProtocols . ')' .
                        self::EXT_LINK_ADDR .
-                       self::EXT_LINK_URL_CLASS . 
'*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su';
+                       self::EXT_LINK_URL_CLASS . 
'*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F\\x{FFFD}]*?)\]/Su';
                if ( isset( $conf['preprocessorClass'] ) ) {
                        $this->mPreprocessorClass = $conf['preprocessorClass'];
                } elseif ( defined( 'HPHP_VERSION' ) ) {
@@ -417,6 +419,8 @@
                        $text = strtr( $text, "\x7f", "?" );
                        $magicScopeVariable = $this->lock();
                }
+               // Strip U+0000 NULL (T159174)
+               $text = str_replace( "\000", '', $text );
 
                $this->startParse( $title, $options, self::OT_HTML, $clearState 
);
 
@@ -4462,6 +4466,9 @@
                $this->startParse( $title, $options, self::OT_WIKI, $clearState 
);
                $this->setUser( $user );
 
+               // Strip U+0000 NULL (T159174)
+               $text = str_replace( "\000", '', $text );
+
                // We still normalize line endings for backwards-compatibility
                // with other code that just calls PST, but this should already
                // be handled in TextContent subclasses
diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php
index 06fec44..8607d59 100644
--- a/languages/LanguageConverter.php
+++ b/languages/LanguageConverter.php
@@ -380,6 +380,7 @@
                $literalBlob = '';
 
                // Guard against delimiter nulls in the input
+               // (should never happen: see T159174)
                $text = str_replace( "\000", '', $text );
 
                $markupMatches = null;
diff --git a/tests/parser/extraParserTests.txt 
b/tests/parser/extraParserTests.txt
index a48087e..8d042d7 100644
--- a/tests/parser/extraParserTests.txt
+++ b/tests/parser/extraParserTests.txt
Binary files differ

-- 
To view, visit https://gerrit.wikimedia.org/r/340225
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3f67e92b61aacc87a40c3662085c84d1dac08bfb
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: C. Scott Ananian <canan...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to