Krinkle has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/82040


Change subject: Title: Add byte class to unicode class conversion for js
......................................................................

Title: Add byte class to unicode class conversion for js

The upcoming rewrite of mw.Title needs to use wgLegalTitleChars,
but for that to work, it needs to be converted into something
that can work in javascript.

Signed-off-by: Timo Tijhof <[email protected]>
Change-Id: I163f3d7e3a680d52640a93f4bd195d8209669918
---
M includes/Title.php
M includes/resourceloader/ResourceLoaderStartUpModule.php
M tests/phpunit/includes/TitleTest.php
3 files changed, 108 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/40/82040/1

diff --git a/includes/Title.php b/includes/Title.php
index 734e009..82d7195 100644
--- a/includes/Title.php
+++ b/includes/Title.php
@@ -492,6 +492,41 @@
        }
 
        /**
+        * Utility method for converting a regex character class from byte escapes to 
Unicode escapes.
+        *
+        * Primary use case is converting $wgLegalTitleChars to a sequence 
usable in
+        * JavaScript, as PHP uses UTF-8 bytes where JavaScript uses Unicode 
code units.
+        *
+        * @param string $byteClass Character class contents using \xHH byte escapes
+        * @return string Same class with \x80-\xFF escapes dropped and, if any were found, \u0080-\uFFFF appended
+        */
+       public static function convertByteClassToUnicodeClass( $byteClass ) {
+               // If the PHP class contains any byte escapes above \x7F, remove them 
and add in the whole of
+               // '\u0080-\uFFFF' (more permissive).
+
+               $charClass = $byteClass;
+
+               $count1 = 0; // Pass 1: range from an ASCII \xHH escape to a high-byte escape, upper bound becomes \x7F
+               $charClass = preg_replace( 
'/(?<!\\\\)((?:\\\\\\\\)*)([\\\\]x[0-7][0-9A-Fa-f])-([\\\\]x[8-9A-Fa-f][0-9A-Fa-f])/',
 '$1$2-\\x7F', $charClass, -1, $count1 );
+
+               $count2 = 0; // Pass 2: range whose both ends are high-byte escapes, removed entirely
+               $charClass = preg_replace( 
'/(?<!\\\\)((?:\\\\\\\\)*)([\\\\]x[8-9A-Fa-f][0-9A-Fa-f])-([\\\\]x[8-9A-Fa-f][0-9A-Fa-f])/'
 , '$1', $charClass, -1, $count2 );
+
+               $count3 = 0; // Pass 3: range from any non-backslash character to a high-byte escape, upper bound becomes \x7F
+               $charClass = preg_replace( 
'/([^\\\\])-[\\\\]x[8-9A-Fa-f][0-9A-Fa-f]/', '$1-\\x7F', $charClass, -1, 
$count3 );
+
+               $count4 = 0; // Pass 4: any remaining standalone high-byte escape, removed
+               $charClass = preg_replace( 
'/(?<!\\\\)((?:\\\\\\\\)*)([\\\\]x[8-9A-Fa-f][0-9A-Fa-f])/', '$1', $charClass, 
-1, $count4 );
+
+               if ( $count1 || $count2 || $count3 || $count4 ) {
+                       // At least one high-byte escape was rewritten: allow every non-ASCII code unit
+                       $charClass .= '\u0080-\uFFFF';
+               }
+
+               return $charClass;
+       }
+
+       /**
         * Get a string representation of a title suitable for
         * including in a search index
         *
diff --git a/includes/resourceloader/ResourceLoaderStartUpModule.php 
b/includes/resourceloader/ResourceLoaderStartUpModule.php
index 861ff18..1a30f69 100644
--- a/includes/resourceloader/ResourceLoaderStartUpModule.php
+++ b/includes/resourceloader/ResourceLoaderStartUpModule.php
@@ -95,6 +95,7 @@
                        'wgCookiePrefix' => $wgCookiePrefix,
                        'wgResourceLoaderMaxQueryLength' => 
$wgResourceLoaderMaxQueryLength,
                        'wgCaseSensitiveNamespaces' => $caseSensitiveNamespaces,
+                       'wgLegalTitleChars' => 
Title::convertByteClassToUnicodeClass( Title::legalChars() ),
                );
 
                wfRunHooks( 'ResourceLoaderGetConfigVars', array( &$vars ) );
@@ -102,6 +103,10 @@
                return $vars;
        }
 
+       protected static function getLegalTitleCharsForJS() { // Legal title chars as a character class usable in JavaScript
+               return Title::convertByteClassToUnicodeClass( Title::legalChars() );
+       }
+
        /**
         * Gets registration code for all modules
         *
diff --git a/tests/phpunit/includes/TitleTest.php 
b/tests/phpunit/includes/TitleTest.php
index 33bd8d6..1e791ea 100644
--- a/tests/phpunit/includes/TitleTest.php
+++ b/tests/phpunit/includes/TitleTest.php
@@ -32,6 +32,74 @@
                }
        }
 
+       public static function provideConvertByteClassToUnicodeClass() {
+               return array( // Each row: input byte class, expected converted class
+                       array( // Full legal-chars style class: \x80-\xFF range dropped, \u range appended
+                               ' 
%!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+',
+                               ' 
%!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~+\\u0080-\\uFFFF',
+                       ),
+                       array( // ASCII-to-\xFF range: upper bound clamped to \x7F
+                               'QWERTY\\x66-\\xFF+',
+                               'QWERTY\\x66-\\x7F+\\u0080-\\uFFFF',
+                       ),
+                       array( // ASCII-to-\xFD range: clamped the same way
+                               'QWERTY\\x66-\\xFD+',
+                               'QWERTY\\x66-\\x7F+\\u0080-\\uFFFF',
+                       ),
+                       array( // Range entirely within ASCII: unchanged, nothing appended
+                               'QWERTY\\x66-\\x79+',
+                               'QWERTY\\x66-\\x79+',
+                       ),
+                       array( // Upper bound \x80, the first high byte: clamped
+                               'QWERTY\\x66-\\x80+',
+                               'QWERTY\\x66-\\x7F+\\u0080-\\uFFFF',
+                       ),
+                       array( // Trailing ASCII escape \x23 is kept
+                               'QWERTY\\x66-\\x80+\\x23',
+                               'QWERTY\\x66-\\x7F+\\x23\\u0080-\\uFFFF',
+                       ),
+                       array( // Trailing standalone high-byte escape \xD3 is removed
+                               'QWERTY\\x66-\\x80+\\xD3',
+                               'QWERTY\\x66-\\x7F+\\u0080-\\uFFFF',
+                       ),
+                       array( // Escaped backslash followed by a real \x99 escape: escape removed
+                               '\\\\\\x99',
+                               '\\\\\\u0080-\\uFFFF',
+                       ),
+                       array( // Leading literal hyphen is not a range: \x99 removed, hyphen kept
+                               '-\\x99',
+                               '-\\u0080-\\uFFFF',
+                       ),
+                       array( // Escaped hyphen is not a range either
+                               'QWERTY\\-\\x99',
+                               'QWERTY\\-\\u0080-\\uFFFF',
+                       ),
+                       array( // Escaped backslash followed by literal x99: not an escape, unchanged
+                               '\\\\x99',
+                               '\\\\x99',
+                       ),
+                       array( // Range from a literal character to a high-byte escape: clamped
+                               'A-\\x9F',
+                               'A-\\x7F\\u0080-\\uFFFF',
+                       ),
+                       array( // ASCII range kept, high-only range \x88-\x91 removed
+                               '\\x66-\\x77QWERTY\\x88-\\x91FXZ',
+                               '\\x66-\\x77QWERTYFXZ\\u0080-\\uFFFF',
+                       ),
+                       array( // First range clamped, second high-only range removed
+                               '\\x66-\\x99QWERTY\\xAA-\\xEEFXZ',
+                               '\\x66-\\x7FQWERTYFXZ\\u0080-\\uFFFF',
+                       ),
+               );
+       }
+
+       /**
+        * @dataProvider provideConvertByteClassToUnicodeClass
+        */
+       public function testConvertByteClassToUnicodeClass( $byteClass, $unicodeClass, $message = null ) {
+               $this->assertEquals( $unicodeClass, Title::convertByteClassToUnicodeClass( $byteClass ), $message );
+       }
+
        /**
         * @dataProvider provideBug31100
         */

-- 
To view, visit https://gerrit.wikimedia.org/r/82040
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I163f3d7e3a680d52640a93f4bd195d8209669918
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Krinkle <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to