Hello Krinkle, Legoktm, jenkins-bot, Siebrand, Tpt, Anomie, Jforrester, 
Jackmcbarn,

I'd like you to do a code review.  Please visit

    https://gerrit.wikimedia.org/r/381146

to review the following change.


Change subject: Revert "EditPage: Don't allow clients that mangle unicode to 
edit"
......................................................................

Revert "EditPage: Don't allow clients that mangle unicode to edit"

Shit hit fan.

This reverts commit 19b739bd19c437c5a637d85f431f139da7521508.

Bug: T67297
Change-Id: Ie57194ff2be97c19d6942a003886a9aa0f3778ff
---
M RELEASE-NOTES-1.30
M includes/DefaultSettings.php
M includes/EditPage.php
M includes/api/ApiEditPage.php
M languages/i18n/en.json
M languages/i18n/qqq.json
M tests/phpunit/includes/EditPageTest.php
7 files changed, 166 insertions(+), 48 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/46/381146/1

diff --git a/RELEASE-NOTES-1.30 b/RELEASE-NOTES-1.30
index 513dc3a..8517a8f 100644
--- a/RELEASE-NOTES-1.30
+++ b/RELEASE-NOTES-1.30
@@ -73,7 +73,6 @@
 * (T138166) Added ability for users to prohibit other users from sending them
   emails with Special:Emailuser. Can be enabled by setting
   $wgEnableUserEmailBlacklist to true.
-* $wgBrowserBlacklist is deprecated, and changing it will have no effect.
 
 === External library changes in 1.30 ===
 
@@ -222,7 +221,6 @@
 * wfShellExec() and related functions are deprecated, use Shell::command().
 * (T138166) SpecialEmailUser::getTarget() now requires a second argument, the 
sending
   user object. Using the method without the second argument is deprecated.
-* (T67297) Browsers that don't support Unicode will have their edits rejected.
 
 == Compatibility ==
 MediaWiki 1.30 requires PHP 5.5.9 or later. There is experimental support for
diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php
index 5630dcb..cdb7ce8 100644
--- a/includes/DefaultSettings.php
+++ b/includes/DefaultSettings.php
@@ -2980,9 +2980,46 @@
 $wgLegacyEncoding = false;
 
 /**
- * @deprecated since 1.30, does nothing
+ * Browser Blacklist for unicode non compliant browsers. Contains a list of
+ * regexps : "/regexp/"  matching problematic browsers. These browsers will
+ * be served encoded unicode in the edit box instead of real unicode.
  */
-$wgBrowserBlackList = [];
+$wgBrowserBlackList = [
+       /**
+        * Netscape 2-4 detection
+        * The minor version may contain strings such as "Gold" or "SGoldC-SGI"
+        * Lots of non-netscape user agents have "compatible", so it's useful 
to check for that
+        * with a negative assertion. The [UIN] identifier specifies the level 
of security
+        * in a Netscape/Mozilla browser, checking for it rules out a number of 
fakers.
+        * The language string is unreliable, it is missing on NS4 Mac.
+        *
+        * Reference: http://www.psychedelix.com/agents/index.shtml
+        */
+       '/^Mozilla\/2\.[^ ]+ [^(]*?\((?!compatible).*; [UIN]/',
+       '/^Mozilla\/3\.[^ ]+ [^(]*?\((?!compatible).*; [UIN]/',
+       '/^Mozilla\/4\.[^ ]+ [^(]*?\((?!compatible).*; [UIN]/',
+
+       /**
+        * MSIE on Mac OS 9 is teh sux0r, converts þ to <thorn>, ð to <eth>,
+        * Þ to <THORN> and Ð to <ETH>
+        *
+        * Known useragents:
+        * - Mozilla/4.0 (compatible; MSIE 5.0; Mac_PowerPC)
+        * - Mozilla/4.0 (compatible; MSIE 5.15; Mac_PowerPC)
+        * - Mozilla/4.0 (compatible; MSIE 5.23; Mac_PowerPC)
+        * - [...]
+        *
+        * @link 
https://en.wikipedia.org/w/index.php?diff=12356041&oldid=12355864
+        * @link https://en.wikipedia.org/wiki/Template%3AOS9
+        */
+       '/^Mozilla\/4\.0 \(compatible; MSIE \d+\.\d+; Mac_PowerPC\)/',
+
+       /**
+        * Google wireless transcoder, seems to eat a lot of chars alive
+        * 
https://it.wikipedia.org/w/index.php?title=Luciano_Ligabue&diff=prev&oldid=8857361
+        */
+       '/^Mozilla\/4\.0 \(compatible; MSIE 6.0; Windows NT 5.0; Google 
Wireless Transcoder;\)/'
+];
 
 /**
  * If set to true, the MediaWiki 1.4 to 1.5 schema conversion will
diff --git a/includes/EditPage.php b/includes/EditPage.php
index b40a054..0ea61c0 100644
--- a/includes/EditPage.php
+++ b/includes/EditPage.php
@@ -41,11 +41,6 @@
  */
 class EditPage {
        /**
-        * Used for Unicode support checks
-        */
-       const UNICODE_CHECK = 'ℳ𝒲♥𝓊𝓃𝒾𝒸ℴ𝒹ℯ';
-
-       /**
         * Status: Article successfully updated
         */
        const AS_SUCCESS_UPDATE = 200;
@@ -181,11 +176,6 @@
         * $wgContentHandlerUseDB being false
         */
        const AS_CANNOT_USE_CUSTOM_MODEL = 241;
-
-       /**
-        * Status: edit rejected because browser doesn't support Unicode.
-        */
-       const AS_UNICODE_NOT_SUPPORTED = 242;
 
        /**
         * HTML id and name for the beginning of the edit form.
@@ -422,11 +412,6 @@
         * @var bool Whether an old revision is edited
         */
        private $isOldRev = false;
-
-       /**
-        * @var string|null What the user submitted in the 'wpUnicodeCheck' 
field
-        */
-       private $unicodeCheck;
 
        /**
         * @param Article $article
@@ -880,7 +865,7 @@
                        # These fields need to be checked for encoding.
                        # Also remove trailing whitespace, but don't remove 
_initial_
                        # whitespace from the text boxes. This may be 
significant formatting.
-                       $this->textbox1 = rtrim( $request->getText( 
'wpTextbox1' ) );
+                       $this->textbox1 = $this->safeUnicodeInput( $request, 
'wpTextbox1' );
                        if ( !$request->getCheck( 'wpTextbox2' ) ) {
                                // Skip this if wpTextbox2 has input, it 
indicates that we came
                                // from a conflict page with raw page text, not 
a custom form
@@ -890,8 +875,6 @@
                                        $this->textbox1 = $textbox1;
                                }
                        }
-
-                       $this->unicodeCheck = $request->getText( 
'wpUnicodeCheck' );
 
                        $this->summary = $request->getText( 'wpSummary' );
 
@@ -1561,7 +1544,6 @@
 
                        case self::AS_CANNOT_USE_CUSTOM_MODEL:
                        case self::AS_PARSE_ERROR:
-                       case self::AS_UNICODE_NOT_SUPPORTED:
                                $out->addWikiText( '<div class="error">' . "\n" 
. $status->getWikiText() . '</div>' );
                                return true;
 
@@ -1760,12 +1742,6 @@
                        wfDebug( "Hook 'EditPage::attemptSave' aborted article 
saving\n" );
                        $status->fatal( 'hookaborted' );
                        $status->value = self::AS_HOOK_ERROR;
-                       return $status;
-               }
-
-               if ( $this->unicodeCheck !== self::UNICODE_CHECK ) {
-                       $status->fatal( 'unicode-support-fail' );
-                       $status->value = self::AS_UNICODE_NOT_SUPPORTED;
                        return $status;
                }
 
@@ -2697,9 +2673,6 @@
                        call_user_func_array( $formCallback, [ &$out ] );
                }
 
-               // Add a check for Unicode support
-               $out->addHTML( Html::hidden( 'wpUnicodeCheck', 
self::UNICODE_CHECK ) );
-
                // Add an empty field to trip up spambots
                $out->addHTML(
                        Xml::openElement( 'div', [ 'id' => 
'antispam-container', 'style' => 'display: none;' ] )
@@ -2974,6 +2947,10 @@
                                $out->addWikiText( $this->hookError );
                        }
 
+                       if ( !$this->checkUnicodeCompliantBrowser() ) {
+                               $out->addWikiMsg( 'nonunicodebrowser' );
+                       }
+
                        if ( $this->section != 'new' ) {
                                $revision = 
$this->mArticle->getRevisionFetched();
                                if ( $revision ) {
@@ -3244,6 +3221,10 @@
                $out->addHTML( Html::hidden( 'wpEdittime', $this->edittime ) );
                $out->addHTML( Html::hidden( 'editRevId', $this->editRevId ) );
                $out->addHTML( Html::hidden( 'wpScrolltop', $this->scrolltop, [ 
'id' => 'wpScrolltop' ] ) );
+
+               if ( !$this->checkUnicodeCompliantBrowser() ) {
+                       $out->addHTML( Html::hidden( 'safemode', '1' ) );
+               }
        }
 
        protected function showFormAfterText() {
@@ -3337,7 +3318,8 @@
        }
 
        protected function showTextbox( $text, $name, $customAttribs = [] ) {
-               $wikitext = $this->addNewLineAtEnd( $text );
+               $wikitext = $this->safeUnicodeOutput( $text );
+               $wikitext = $this->addNewLineAtEnd( $wikitext );
 
                $attribs = $this->buildTextboxAttribs( $name, $customAttribs, 
$this->context->getUser() );
 
@@ -4480,30 +4462,137 @@
        }
 
        /**
+        * Check if the browser is on a blacklist of user-agents known to
+        * mangle UTF-8 data on form submission. Returns true if Unicode
+        * should make it through, false if it's known to be a problem.
+        * @return bool
+        */
+       private function checkUnicodeCompliantBrowser() {
+               global $wgBrowserBlackList;
+
+               $currentbrowser = $this->context->getRequest()->getHeader( 
'User-Agent' );
+               if ( $currentbrowser === false ) {
+                       // No User-Agent header sent? Trust it by default...
+                       return true;
+               }
+
+               foreach ( $wgBrowserBlackList as $browser ) {
+                       if ( preg_match( $browser, $currentbrowser ) ) {
+                               return false;
+                       }
+               }
+               return true;
+       }
+
+       /**
         * Filter an input field through a Unicode de-armoring process if it
         * came from an old browser with known broken Unicode editing issues.
-        *
-        * @deprecated since 1.30, does nothing
         *
         * @param WebRequest $request
         * @param string $field
         * @return string
         */
        protected function safeUnicodeInput( $request, $field ) {
-               return rtrim( $request->getText( $field ) );
+               $text = rtrim( $request->getText( $field ) );
+               return $request->getBool( 'safemode' )
+                       ? $this->unmakeSafe( $text )
+                       : $text;
        }
 
        /**
         * Filter an output field through a Unicode armoring process if it is
         * going to an old browser with known broken Unicode editing issues.
         *
-        * @deprecated since 1.30, does nothing
-        *
         * @param string $text
         * @return string
         */
        protected function safeUnicodeOutput( $text ) {
-               return $text;
+               return $this->checkUnicodeCompliantBrowser()
+                       ? $text
+                       : $this->makeSafe( $text );
+       }
+
+       /**
+        * A number of web browsers are known to corrupt non-ASCII characters
+        * in a UTF-8 text editing environment. To protect against this,
+        * detected browsers will be served an armored version of the text,
+        * with non-ASCII chars converted to numeric HTML character references.
+        *
+        * Preexisting such character references will have a 0 added to them
+        * to ensure that round-trips do not alter the original data.
+        *
+        * @param string $invalue
+        * @return string
+        */
+       private function makeSafe( $invalue ) {
+               // Armor existing references for reversibility.
+               $invalue = strtr( $invalue, [ "&#x" => "&#x0" ] );
+
+               $bytesleft = 0;
+               $result = "";
+               $working = 0;
+               $valueLength = strlen( $invalue );
+               for ( $i = 0; $i < $valueLength; $i++ ) {
+                       $bytevalue = ord( $invalue[$i] );
+                       if ( $bytevalue <= 0x7F ) { // 0xxx xxxx
+                               $result .= chr( $bytevalue );
+                               $bytesleft = 0;
+                       } elseif ( $bytevalue <= 0xBF ) { // 10xx xxxx
+                               $working = $working << 6;
+                               $working += ( $bytevalue & 0x3F );
+                               $bytesleft--;
+                               if ( $bytesleft <= 0 ) {
+                                       $result .= "&#x" . strtoupper( dechex( 
$working ) ) . ";";
+                               }
+                       } elseif ( $bytevalue <= 0xDF ) { // 110x xxxx
+                               $working = $bytevalue & 0x1F;
+                               $bytesleft = 1;
+                       } elseif ( $bytevalue <= 0xEF ) { // 1110 xxxx
+                               $working = $bytevalue & 0x0F;
+                               $bytesleft = 2;
+                       } else { // 1111 0xxx
+                               $working = $bytevalue & 0x07;
+                               $bytesleft = 3;
+                       }
+               }
+               return $result;
+       }
+
+       /**
+        * Reverse the previously applied transliteration of non-ASCII 
characters
+        * back to UTF-8. Used to protect data from corruption by broken web 
browsers
+        * as listed in $wgBrowserBlackList.
+        *
+        * @param string $invalue
+        * @return string
+        */
+       private function unmakeSafe( $invalue ) {
+               $result = "";
+               $valueLength = strlen( $invalue );
+               for ( $i = 0; $i < $valueLength; $i++ ) {
+                       if ( ( substr( $invalue, $i, 3 ) == "&#x" ) && ( 
$invalue[$i + 3] != '0' ) ) {
+                               $i += 3;
+                               $hexstring = "";
+                               do {
+                                       $hexstring .= $invalue[$i];
+                                       $i++;
+                               } while ( ctype_xdigit( $invalue[$i] ) && ( $i 
< strlen( $invalue ) ) );
+
+                               // Do some sanity checks. These aren't needed 
for reversibility,
+                               // but should help keep the breakage down if 
the editor
+                               // breaks one of the entities whilst editing.
+                               if ( ( substr( $invalue, $i, 1 ) == ";" ) && ( 
strlen( $hexstring ) <= 6 ) ) {
+                                       $codepoint = hexdec( $hexstring );
+                                       $result .= 
UtfNormal\Utils::codepointToUtf8( $codepoint );
+                               } else {
+                                       $result .= "&#x" . $hexstring . substr( 
$invalue, $i, 1 );
+                               }
+                       } else {
+                               $result .= substr( $invalue, $i, 1 );
+                       }
+               }
+               // reverse the transform that we made for reversibility reasons.
+               return strtr( $result, [ "&#x0" => "&#x" ] );
        }
 
        /**
diff --git a/includes/api/ApiEditPage.php b/includes/api/ApiEditPage.php
index 94d6e97..4360b4d 100644
--- a/includes/api/ApiEditPage.php
+++ b/includes/api/ApiEditPage.php
@@ -266,7 +266,6 @@
                        'wpIgnoreBlankArticle' => true,
                        'wpIgnoreSelfRedirect' => true,
                        'bot' => $params['bot'],
-                       'wpUnicodeCheck' => EditPage::UNICODE_CHECK,
                ];
 
                if ( !is_null( $params['summary'] ) ) {
diff --git a/languages/i18n/en.json b/languages/i18n/en.json
index 6d06e40..47e6205 100644
--- a/languages/i18n/en.json
+++ b/languages/i18n/en.json
@@ -704,8 +704,8 @@
        "explainconflict": "Someone else has changed this page since you 
started editing it.\nThe upper text area contains the page text as it currently 
exists.\nYour changes are shown in the lower text area.\nYou will have to merge 
your changes into the existing text.\n<strong>Only</strong> the text in the 
upper text area will be saved when you press \"$1\".",
        "yourtext": "Your text",
        "storedversion": "Stored revision",
+       "nonunicodebrowser": "<strong>Warning: Your browser is not Unicode 
compliant.</strong>\nA workaround is in place to allow you to safely edit 
pages: Non-ASCII characters will appear in the edit box as hexadecimal codes.",
        "editingold": "<strong>Warning: You are editing an out-of-date revision 
of this page.</strong>\nIf you save it, any changes made since this revision 
will be lost.",
-       "unicode-support-fail": "It appears that your browser does not support 
Unicode. It is required to edit pages, so your edit was not saved.",
        "yourdiff": "Differences",
        "copyrightwarning": "Please note that all contributions to {{SITENAME}} 
are considered to be released under the $2 (see $1 for details).\nIf you do not 
want your writing to be edited mercilessly and redistributed at will, then do 
not submit it here.<br />\nYou are also promising us that you wrote this 
yourself, or copied it from a public domain or similar free 
resource.\n<strong>Do not submit copyrighted work without permission!</strong>",
        "copyrightwarning2": "Please note that all contributions to 
{{SITENAME}} may be edited, altered, or removed by other contributors.\nIf you 
do not want your writing to be edited mercilessly, then do not submit it 
here.<br />\nYou are also promising us that you wrote this yourself, or copied 
it from a public domain or similar free resource (see $1 for 
details).\n<strong>Do not submit copyrighted work without permission!</strong>",
diff --git a/languages/i18n/qqq.json b/languages/i18n/qqq.json
index 7534322..0a6e91b 100644
--- a/languages/i18n/qqq.json
+++ b/languages/i18n/qqq.json
@@ -896,8 +896,8 @@
        "explainconflict": "Appears at the top of a page when there is an edit 
conflict.\n\nParameters:\n* $1 – The label of the save button – one of 
{{msg-mw|savearticle}} or {{msg-mw|savechanges}} on save-labelled wiki, or 
{{msg-mw|publishpage}} or {{msg-mw|publishchanges}} on publish-labelled 
wikis.\n\nSee also:\n* {{msg-mw|Savearticle}}",
        "yourtext": "Used in Diff Preview page. The diff is between 
{{msg-mw|currentrev}} and {{msg-mw|yourtext}}.\n\nAlso used in Edit Conflict 
page; the diff between {{msg-mw|yourtext}} and {{msg-mw|storedversion}}.",
        "storedversion": "This is used in an edit conflict as the label for the 
top revision that has been stored, as opposed to your version 
{{msg-mw|yourtext}} that has not been stored which is shown at the bottom of 
the page.",
+       "nonunicodebrowser": "Used as warning when editing page.",
        "editingold": "Used as warning when editing an old revision of a page.",
-       "unicode-support-fail": "Error message shown to users if their browser 
doesn't support Unicode",
        "yourdiff": "Used as h2 header for the diff of the current version of a 
page with the user's version in case there is an edit conflict or a spam filter 
hit.",
        "copyrightwarning": "Copyright warning displayed under the edit box in 
editor. Parameters:\n* $1 - link\n* $2 - license name",
        "copyrightwarning2": "Copyright warning displayed under the edit box in 
editor\n*$1 - license name",
diff --git a/tests/phpunit/includes/EditPageTest.php 
b/tests/phpunit/includes/EditPageTest.php
index ad0d07a..9507811 100644
--- a/tests/phpunit/includes/EditPageTest.php
+++ b/tests/phpunit/includes/EditPageTest.php
@@ -165,10 +165,6 @@
                        $edit['wpStarttime'] = wfTimestampNow();
                }
 
-               if ( !isset( $edit['wpUnicodeCheck'] ) ) {
-                       $edit['wpUnicodeCheck'] = EditPage::UNICODE_CHECK;
-               }
-
                $req = new FauxRequest( $edit, true ); // session ??
 
                $article = new Article( $title );
@@ -701,8 +697,7 @@
                        'wpTextbox1' => serialize( 'non-text content' ),
                        'wpEditToken' => $user->getEditToken(),
                        'wpEdittime' => '',
-                       'wpStarttime' => wfTimestampNow(),
-                       'wpUnicodeCheck' => EditPage::UNICODE_CHECK,
+                       'wpStarttime' => wfTimestampNow()
                ];
 
                $req = new FauxRequest( $edit, true );

-- 
To view, visit https://gerrit.wikimedia.org/r/381146
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie57194ff2be97c19d6942a003886a9aa0f3778ff
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: MaxSem <maxsem.w...@gmail.com>
Gerrit-Reviewer: Anomie <bjor...@wikimedia.org>
Gerrit-Reviewer: Jackmcbarn <jackmcb...@gmail.com>
Gerrit-Reviewer: Jforrester <jforres...@wikimedia.org>
Gerrit-Reviewer: Krinkle <krinklem...@gmail.com>
Gerrit-Reviewer: Legoktm <lego...@member.fsf.org>
Gerrit-Reviewer: Siebrand <siebr...@kitano.nl>
Gerrit-Reviewer: Tpt <thoma...@hotmail.fr>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to