Krinkle has uploaded a new change for review. https://gerrit.wikimedia.org/r/82040
Change subject: Title: Add byte class to unicode class conversion for js ...................................................................... Title: Add byte class to unicode class conversion for js The upcoming rewrite of mw.Title needs to use wgLegalTitleChars, but for that to work, it needs to be converted into something that can work in javascript. Signed-off-by: Timo Tijhof <[email protected]> Change-Id: I163f3d7e3a680d52640a93f4bd195d8209669918 --- M includes/Title.php M includes/resourceloader/ResourceLoaderStartUpModule.php M tests/phpunit/includes/TitleTest.php 3 files changed, 108 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/40/82040/1 diff --git a/includes/Title.php b/includes/Title.php index 734e009..82d7195 100644 --- a/includes/Title.php +++ b/includes/Title.php @@ -492,6 +492,41 @@ } /** + * Utility method for converting a character sequence from bytes to Unicode. + * + * Primary usecase being converting $wgLegalTitleChars to a sequence usable in + * javascript, as PHP uses UTF-8 bytes where javascript uses Unicode code units. + * + * @param string $byteClass + * @return string + */ + public static function convertByteClassToUnicodeClass( $byteClass ) { + // If the PHP class contains any bytes above \xFF, remove them and add in the whole of + // '\x80-\xFF' (more permissive). + + $charClass = $byteClass; + + $count1 = 0; + $charClass = preg_replace( '/(?<!\\\\)((?:\\\\\\\\)*)([\\\\]x[0-7][0-9A-Fa-f])-([\\\\]x[8-9A-Fa-f][0-9A-Fa-f])/', '$1$2-\\x7F', $charClass, -1, $count1 ); + + $count2 = 0; + $charClass = preg_replace( '/(?<!\\\\)((?:\\\\\\\\)*)([\\\\]x[8-9A-Fa-f][0-9A-Fa-f])-([\\\\]x[8-9A-Fa-f][0-9A-Fa-f])/' , '$1', $charClass, -1, $count2 ); + + $count3 = 0; + $charClass = preg_replace( '/([^\\\\])-[\\\\]x[8-9A-Fa-f][0-9A-Fa-f]/', '$1-\\x7F', $charClass, -1, $count3 ); + + $count4 = 0; + $charClass = preg_replace( '/(?<!\\\\)((?:\\\\\\\\)*)([\\\\]x[8-9A-Fa-f][0-9A-Fa-f])/', '$1', $charClass, -1, $count4 ); + + if ( $count1 || $count2 || $count3 || $count4 ) { + // Allow every non-ascii sequence + $charClass .= '\u0080-\uFFFF'; + } + + return $charClass; + } + + /** * Get a string representation of a title suitable for * including in a search index * diff --git a/includes/resourceloader/ResourceLoaderStartUpModule.php b/includes/resourceloader/ResourceLoaderStartUpModule.php index 861ff18..1a30f69 100644 --- a/includes/resourceloader/ResourceLoaderStartUpModule.php +++ b/includes/resourceloader/ResourceLoaderStartUpModule.php @@ -95,6 +95,7 @@ 'wgCookiePrefix' => $wgCookiePrefix, 'wgResourceLoaderMaxQueryLength' => $wgResourceLoaderMaxQueryLength, 'wgCaseSensitiveNamespaces' => $caseSensitiveNamespaces, + 'wgLegalTitleChars' => Title::convertByteClassToUnicodeClass( Title::legalChars() ), ); wfRunHooks( 'ResourceLoaderGetConfigVars', array( &$vars ) ); @@ -102,6 +103,10 @@ return $vars; } + protected static function getLegalTitleCharsForJS() { + $chars = Title::legalChars(); + } + /** * Gets registration code for all modules * diff --git a/tests/phpunit/includes/TitleTest.php b/tests/phpunit/includes/TitleTest.php index 33bd8d6..1e791ea 100644 --- a/tests/phpunit/includes/TitleTest.php +++ b/tests/phpunit/includes/TitleTest.php @@ -32,6 +32,74 @@ } } + public static function provideConvertByteClassToUnicodeClass() { + return array( + array( + ' %!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+', + ' %!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~+\\u0080-\\uFFFF', + ), + array( + 'QWERTY\\x66-\\xFF+', + 'QWERTY\\x66-\\x7F+\\u0080-\\uFFFF', + ), + array( + 'QWERTY\\x66-\\xFD+', + 'QWERTY\\x66-\\x7F+\\u0080-\\uFFFF', + ), + array( + 'QWERTY\\x66-\\x79+', + 'QWERTY\\x66-\\x79+', + ), + array( + 'QWERTY\\x66-\\x80+', + 'QWERTY\\x66-\\x7F+\\u0080-\\uFFFF', + ), + array( + 'QWERTY\\x66-\\x80+\\x23', + 'QWERTY\\x66-\\x7F+\\x23\\u0080-\\uFFFF', + ), + array( + 'QWERTY\\x66-\\x80+\\xD3', + 'QWERTY\\x66-\\x7F+\\u0080-\\uFFFF', + ), + array( + '\\\\\\x99', + '\\\\\\u0080-\\uFFFF', + ), + array( + '-\\x99', + '-\\u0080-\\uFFFF', + ), + array( + 'QWERTY\\-\\x99', + 'QWERTY\\-\\u0080-\\uFFFF', + ), + array( + '\\\\x99', + '\\\\x99', + ), + array( + 'A-\\x9F', + 'A-\\x7F\\u0080-\\uFFFF', + ), + array( + '\\x66-\\x77QWERTY\\x88-\\x91FXZ', + '\\x66-\\x77QWERTYFXZ\\u0080-\\uFFFF', + ), + array( + '\\x66-\\x99QWERTY\\xAA-\\xEEFXZ', + '\\x66-\\x7FQWERTYFXZ\\u0080-\\uFFFF', + ), + ); + } + + /** + * @dataProvider provideConvertByteClassToUnicodeClass + */ + function testConvertByteClassToUnicodeClass( $byteClass, $unicodeClass, $message = null ) { + $this->assertEquals( $unicodeClass, Title::convertByteClassToUnicodeClass( $byteClass ), $message ); + } + /** * @dataProvider provideBug31100 */ -- To view, visit https://gerrit.wikimedia.org/r/82040 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I163f3d7e3a680d52640a93f4bd195d8209669918 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: Krinkle <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
