PleaseStand has uploaded a new change for review.
https://gerrit.wikimedia.org/r/85105
Change subject: Add /S modifier to remaining uses of ASCII check regex
......................................................................
Add /S modifier to remaining uses of ASCII check regex
This is a theoretical performance improvement that is probably only
slightly meaningful for megabyte-sized ASCII texts. However, I have
taken this opportunity to remove some code from LanguageEo that was
commented out in 2005 (bug 1512, r10997).
Also follows up on I0a06b10eeee9a6bb04529d669fed8c69a4d9c172 by changing
some of the double-quoted regexes back to single-quoted regexes.
Change-Id: Ifac5d576c8b60bbc6d956db85ea05f7db6b37c2b
---
M includes/StringUtils.php
M includes/normal/UtfNormal.php
M includes/search/SearchEngine.php
M languages/Language.php
M languages/classes/LanguageEo.php
5 files changed, 11 insertions(+), 35 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core
refs/changes/05/85105/1
diff --git a/includes/StringUtils.php b/includes/StringUtils.php
index 9e21d03..9430e91 100644
--- a/includes/StringUtils.php
+++ b/includes/StringUtils.php
@@ -51,7 +51,7 @@
*/
static function isUtf8( $value, $disableMbstring = false ) {
$value = (string)$value;
- if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
+ if ( preg_match( '/[\x80-\xff]/S', $value ) === 0 ) {
// String contains only ASCII characters, has to be
valid
return true;
}
@@ -65,7 +65,7 @@
}
return mb_check_encoding( $value, 'UTF-8' ) &&
- ( $newPHP || preg_match(
"/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
+ ( $newPHP || preg_match(
'/\xf4[\x90-\xbf]|[\xf5-\xff]/S', $value ) === 0 );
}
// PCRE implements repetition using recursion; to avoid a stack
overflow (and segfault)
@@ -74,7 +74,7 @@
// used rather than a single long regex for performance.
static $regexes;
if ( $regexes === null ) {
- $cont = "[\x80-\xbf]";
+ $cont = '[\x80-\xbf]';
$after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would
work here
$regexes = array(
// Continuation byte at the start
@@ -84,7 +84,7 @@
"/[\\x00-\x7f]$cont/S",
// Illegal byte
- "/[\xc0\xc1\xf5-\xff]/S",
+ '/[\xc0\xc1\xf5-\xff]/S',
// Invalid 2-byte sequence, or valid one then
an extra continuation byte
"/[\xc2-\xdf](?!$cont$after)/S",
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 5a091af..5567ce5 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -140,7 +140,7 @@
return normalizer_normalize( $string,
Normalizer::FORM_D );
elseif( NORMALIZE_ICU )
return utf8_normalize( $string, self::UNORM_NFD );
- elseif( preg_match( '/[\x80-\xff]/', $string ) )
+ elseif( preg_match( '/[\x80-\xff]/S', $string ) )
return UtfNormal::NFD( $string );
else
return $string;
@@ -159,7 +159,7 @@
return normalizer_normalize( $string,
Normalizer::FORM_KC );
elseif( NORMALIZE_ICU )
return utf8_normalize( $string, self::UNORM_NFKC );
- elseif( preg_match( '/[\x80-\xff]/', $string ) )
+ elseif( preg_match( '/[\x80-\xff]/S', $string ) )
return UtfNormal::NFKC( $string );
else
return $string;
@@ -178,7 +178,7 @@
return normalizer_normalize( $string,
Normalizer::FORM_KD );
elseif( NORMALIZE_ICU )
return utf8_normalize( $string, self::UNORM_NFKD );
- elseif( preg_match( '/[\x80-\xff]/', $string ) )
+ elseif( preg_match( '/[\x80-\xff]/S', $string ) )
return UtfNormal::NFKD( $string );
else
return $string;
@@ -203,7 +203,7 @@
static function quickIsNFC( $string ) {
# ASCII is always valid NFC!
# If it's pure ASCII, let it through.
- if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
+ if( !preg_match( '/[\x80-\xff]/S', $string ) ) return true;
UtfNormal::loadData();
$len = strlen( $string );
@@ -247,7 +247,7 @@
# ASCII is always valid NFC!
# If we're only ever given plain ASCII, we can avoid the
overhead
# of initializing the decomposition tables by skipping out
early.
- if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
+ if( !preg_match( '/[\x80-\xff]/S', $string ) ) return true;
static $checkit = null, $tailBytes = null, $utfCheckOrCombining
= null;
if( !isset( $checkit ) ) {
diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php
index e5925fa..741ebee 100644
--- a/includes/search/SearchEngine.php
+++ b/includes/search/SearchEngine.php
@@ -1111,7 +1111,7 @@
// prepare regexps
foreach ( $terms as $index => $term ) {
// manually do upper/lowercase stuff for utf-8 since
PHP won't do it
- if ( preg_match( '/[\x80-\xff]/', $term ) ) {
+ if ( preg_match( '/[\x80-\xff]/S', $term ) ) {
$terms[$index] = preg_replace_callback(
'/./us', array( $this, 'caseCallback' ), $terms[$index] );
} else {
$terms[$index] = $term;
diff --git a/languages/Language.php b/languages/Language.php
index 356726f..38d7a31 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -2519,7 +2519,7 @@
* @return bool
*/
function isMultibyte( $str ) {
- return (bool)preg_match( '/[\x80-\xff]/', $str );
+ return (bool)preg_match( '/[\x80-\xff]/S', $str );
}
/**
diff --git a/languages/classes/LanguageEo.php b/languages/classes/LanguageEo.php
index 39bdfb5..0e2d8b4 100644
--- a/languages/classes/LanguageEo.php
+++ b/languages/classes/LanguageEo.php
@@ -114,30 +114,6 @@
return strtr( $matches[1], $xu ) . strtr( $matches[2], $xu );
}
- /**
- * @param $s string
- * @return string
- */
- function checkTitleEncoding( $s ) {
- # Check for X-system backwards-compatibility URLs
- $ishigh = preg_match( '/[\x80-\xff]/', $s );
- $isutf = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
-
'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
-
- if ( $ishigh and !$isutf ) {
- # Assume Latin1
- $s = utf8_encode( $s );
- } elseif ( preg_match(
'/(\xc4[\x88\x89\x9c\x9d\xa4\xa5\xb4\xb5]' .
- '|\xc5[\x9c\x9d\xac\xad])/', $s )
- ) {
- return $s;
- }
-
- // if( preg_match( '/[cghjsu]x/i', $s ) )
- // return $this->iconv( 'x', 'utf-8', $s );
- return $s;
- }
-
function initEncoding() {
global $wgEditEncoding;
$wgEditEncoding = 'x';
--
To view, visit https://gerrit.wikimedia.org/r/85105
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ifac5d576c8b60bbc6d956db85ea05f7db6b37c2b
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: PleaseStand <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits