core)

PleaseStand (Code Review) Thu, 19 Sep 2013 13:31:18 -0700

PleaseStand has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/85105



Change subject: Add /S modifier to remaining uses of ASCII check regex
......................................................................

Add /S modifier to remaining uses of ASCII check regex

This is a theoretical performance improvement that is probably only
slightly meaningful for megabyte-sized ASCII texts. However, I have
taken this opportunity to remove some code from LanguageEo that was
commented out in 2005 (bug 1512, r10997).

Also follows up on I0a06b10eeee9a6bb04529d669fed8c69a4d9c172 by changing
some of the double-quoted regexes back to single-quoted regexes.

Change-Id: Ifac5d576c8b60bbc6d956db85ea05f7db6b37c2b
---
M includes/StringUtils.php
M includes/normal/UtfNormal.php
M includes/search/SearchEngine.php
M languages/Language.php
M languages/classes/LanguageEo.php
5 files changed, 11 insertions(+), 35 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/05/85105/1

diff --git a/includes/StringUtils.php b/includes/StringUtils.php
index 9e21d03..9430e91 100644
--- a/includes/StringUtils.php
+++ b/includes/StringUtils.php
@@ -51,7 +51,7 @@
         */
        static function isUtf8( $value, $disableMbstring = false ) {
                $value = (string)$value;
-               if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
+               if ( preg_match( '/[\x80-\xff]/S', $value ) === 0 ) {
                        // String contains only ASCII characters, has to be 
valid
                        return true;
                }
@@ -65,7 +65,7 @@
                        }
 
                        return mb_check_encoding( $value, 'UTF-8' ) &&
-                               ( $newPHP || preg_match( 
"/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
+                               ( $newPHP || preg_match( 
'/\xf4[\x90-\xbf]|[\xf5-\xff]/S', $value ) === 0 );
                }
 
                // PCRE implements repetition using recursion; to avoid a stack 
overflow (and segfault)
@@ -74,7 +74,7 @@
                // used rather than a single long regex for performance.
                static $regexes;
                if ( $regexes === null ) {
-                       $cont = "[\x80-\xbf]";
+                       $cont = '[\x80-\xbf]';
                        $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would 
work here
                        $regexes = array(
                                // Continuation byte at the start
@@ -84,7 +84,7 @@
                                "/[\\x00-\x7f]$cont/S",
 
                                // Illegal byte
-                               "/[\xc0\xc1\xf5-\xff]/S",
+                               '/[\xc0\xc1\xf5-\xff]/S',
 
                                // Invalid 2-byte sequence, or valid one then 
an extra continuation byte
                                "/[\xc2-\xdf](?!$cont$after)/S",
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 5a091af..5567ce5 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -140,7 +140,7 @@
                        return normalizer_normalize( $string, 
Normalizer::FORM_D );
                elseif( NORMALIZE_ICU )
                        return utf8_normalize( $string, self::UNORM_NFD );
-               elseif( preg_match( '/[\x80-\xff]/', $string ) )
+               elseif( preg_match( '/[\x80-\xff]/S', $string ) )
                        return UtfNormal::NFD( $string );
                else
                        return $string;
@@ -159,7 +159,7 @@
                        return normalizer_normalize( $string, 
Normalizer::FORM_KC );
                elseif( NORMALIZE_ICU )
                        return utf8_normalize( $string, self::UNORM_NFKC );
-               elseif( preg_match( '/[\x80-\xff]/', $string ) )
+               elseif( preg_match( '/[\x80-\xff]/S', $string ) )
                        return UtfNormal::NFKC( $string );
                else
                        return $string;
@@ -178,7 +178,7 @@
                        return normalizer_normalize( $string, 
Normalizer::FORM_KD );
                elseif( NORMALIZE_ICU )
                        return utf8_normalize( $string, self::UNORM_NFKD );
-               elseif( preg_match( '/[\x80-\xff]/', $string ) )
+               elseif( preg_match( '/[\x80-\xff]/S', $string ) )
                        return UtfNormal::NFKD( $string );
                else
                        return $string;
@@ -203,7 +203,7 @@
        static function quickIsNFC( $string ) {
                # ASCII is always valid NFC!
                # If it's pure ASCII, let it through.
-               if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
+               if( !preg_match( '/[\x80-\xff]/S', $string ) ) return true;
 
                UtfNormal::loadData();
                $len = strlen( $string );
@@ -247,7 +247,7 @@
                # ASCII is always valid NFC!
                # If we're only ever given plain ASCII, we can avoid the 
overhead
                # of initializing the decomposition tables by skipping out 
early.
-               if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
+               if( !preg_match( '/[\x80-\xff]/S', $string ) ) return true;
 
                static $checkit = null, $tailBytes = null, $utfCheckOrCombining 
= null;
                if( !isset( $checkit ) ) {
diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php
index e5925fa..741ebee 100644
--- a/includes/search/SearchEngine.php
+++ b/includes/search/SearchEngine.php
@@ -1111,7 +1111,7 @@
                // prepare regexps
                foreach ( $terms as $index => $term ) {
                        // manually do upper/lowercase stuff for utf-8 since 
PHP won't do it
-                       if ( preg_match( '/[\x80-\xff]/', $term ) ) {
+                       if ( preg_match( '/[\x80-\xff]/S', $term ) ) {
                                $terms[$index] = preg_replace_callback( 
'/./us', array( $this, 'caseCallback' ), $terms[$index] );
                        } else {
                                $terms[$index] = $term;
diff --git a/languages/Language.php b/languages/Language.php
index 356726f..38d7a31 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -2519,7 +2519,7 @@
         * @return bool
         */
        function isMultibyte( $str ) {
-               return (bool)preg_match( '/[\x80-\xff]/', $str );
+               return (bool)preg_match( '/[\x80-\xff]/S', $str );
        }
 
        /**
diff --git a/languages/classes/LanguageEo.php b/languages/classes/LanguageEo.php
index 39bdfb5..0e2d8b4 100644
--- a/languages/classes/LanguageEo.php
+++ b/languages/classes/LanguageEo.php
@@ -114,30 +114,6 @@
                return strtr( $matches[1], $xu ) . strtr( $matches[2], $xu );
        }
 
-       /**
-        * @param $s string
-        * @return string
-        */
-       function checkTitleEncoding( $s ) {
-               # Check for X-system backwards-compatibility URLs
-               $ishigh = preg_match( '/[\x80-\xff]/', $s );
-               $isutf = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
-                       
'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
-
-               if ( $ishigh and !$isutf ) {
-                       # Assume Latin1
-                       $s = utf8_encode( $s );
-               } elseif ( preg_match( 
'/(\xc4[\x88\x89\x9c\x9d\xa4\xa5\xb4\xb5]' .
-                               '|\xc5[\x9c\x9d\xac\xad])/', $s )
-               ) {
-                       return $s;
-               }
-
-               // if( preg_match( '/[cghjsu]x/i', $s ) )
-               //      return $this->iconv( 'x', 'utf-8', $s );
-               return $s;
-       }
-
        function initEncoding() {
                global $wgEditEncoding;
                $wgEditEncoding = 'x';

-- 
To view, visit https://gerrit.wikimedia.org/r/85105
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ifac5d576c8b60bbc6d956db85ea05f7db6b37c2b
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: PleaseStand <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Add /S modifier to remaining uses of ASCII check regex - change (mediawiki/core)

Reply via email to