[MediaWiki-commits] [Gerrit] Unicode regexes everywhere! - change (mediawiki...CirrusSearch)

Manybubbles (Code Review) Wed, 27 Aug 2014 14:19:35 -0700

Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/156699


Change subject: Unicode regexes everywhere!
......................................................................

Unicode regexes everywhere!

Well, almost everywhere.

Bug: 69766

Change-Id: I113f72772938fbb649e2773b8a1832bd12e4eb56
---
M includes/BuildDocument/PageDataBuilder.php
M includes/BuildDocument/PageTextBuilder.php
M includes/ElasticsearchIntermediary.php
M includes/Sanity/Checker.php
M includes/Search/Escaper.php
M includes/Searcher.php
M tests/jenkins/Jenkins.php
7 files changed, 25 insertions(+), 23 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/99/156699/1

diff --git a/includes/BuildDocument/PageDataBuilder.php b/includes/BuildDocument/PageDataBuilder.php
index 26409d8..e5ca63a 100644
--- a/includes/BuildDocument/PageDataBuilder.php
+++ b/includes/BuildDocument/PageDataBuilder.php
@@ -105,8 +105,8 @@
 
                        // Some wikis wrap the brackets in a span:
                        // 
http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
-                       $heading = preg_replace( '/<\/?span>/', '', $heading );
-                       $heading = preg_replace( 
'/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/', '', $heading );
+                       $heading = preg_replace( '/<\/?span>/u', '', $heading );
+                       $heading = preg_replace( 
'/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/u', '', $heading );
 
                        // Strip tags from the heading or else we'll display 
them (escaped) in search results
                        $heading = trim( Sanitizer::stripAllTags( $heading ) );
@@ -127,7 +127,7 @@
                        $ignoredHeadings = array();
                        if( !$source->isDisabled() ) {
                                $lines = explode( "\n", $source->plain() );
-                               $lines = preg_replace( '/#.*$/', '', $lines ); 
// Remove comments
+                               $lines = preg_replace( '/#.*$/u', '', $lines ); 
// Remove comments
                                $lines = array_map( 'trim', $lines );          
// Remove extra spaces
                                $lines = array_filter( $lines );               
// Remove empty lines
                                $ignoredHeadings = $lines;               // Now 
we just have headings!
diff --git a/includes/BuildDocument/PageTextBuilder.php b/includes/BuildDocument/PageTextBuilder.php
index 347f96b..8f71315 100644
--- a/includes/BuildDocument/PageTextBuilder.php
+++ b/includes/BuildDocument/PageTextBuilder.php
@@ -135,7 +135,7 @@
 
        private function extractHeadingBeforeFirstHeading( $text ) {
                $matches = array();
-               if ( !preg_match( '/<h[123456]>/', $text, $matches, 
PREG_OFFSET_CAPTURE ) ) {
+               if ( !preg_match( '/<h[123456]>/u', $text, $matches, 
PREG_OFFSET_CAPTURE ) ) {
                        // There isn't a first heading so we interpret this as 
the article
                        // being entirely without heading.
                        return null;
diff --git a/includes/ElasticsearchIntermediary.php b/includes/ElasticsearchIntermediary.php
index 2d12671..6896922 100644
--- a/includes/ElasticsearchIntermediary.php
+++ b/includes/ElasticsearchIntermediary.php
@@ -196,7 +196,7 @@
                        $errorMessage = 'unknown';
                        $position = 'unknown';
                        $matches = array();
-                       if ( preg_match( '/(.+) at position ([0-9]+)/', 
$syntaxError, $matches ) ) {
+                       if ( preg_match( '/(.+) at position ([0-9]+)/u', 
$syntaxError, $matches ) ) {
                                $errorMessage = $matches[ 1 ];
                                // The 3 below offsets the .*( in front of the 
user pattern to make it unanchored.
                                $position = $matches[ 2 ] - 3;
diff --git a/includes/Sanity/Checker.php b/includes/Sanity/Checker.php
index e45b854..c360148 100644
--- a/includes/Sanity/Checker.php
+++ b/includes/Sanity/Checker.php
@@ -73,7 +73,7 @@
                                        $expectedType = 
Connection::getIndexSuffixForNamespace( $page->getTitle()->getNamespace() );
                                        foreach ( $fromIndex as $indexInfo ) {
                                                $matches = array();
-                                               if ( !preg_match( 
'/_(.+)_.+$/', $indexInfo->getIndex(), $matches ) ) {
+                                               if ( !preg_match( 
'/_(.+)_.+$/u', $indexInfo->getIndex(), $matches ) ) {
                                                        return 
Status::newFatal( "Can't parse index name:  " . $indexInfo->getIndex() );
                                                }
                                                $type = $matches[ 1 ];
diff --git a/includes/Search/Escaper.php b/includes/Search/Escaper.php
index f7ec1e5..9615d9e 100644
--- a/includes/Search/Escaper.php
+++ b/includes/Search/Escaper.php
@@ -37,7 +37,7 @@
                        // character (״), call a Gershayim, which mark 
acronyms.  Here we guess if the intent
                        // was to mark a phrase, in which case we leave the 
quotes alone, or to mark an
                        // acronym, in which case we escape them.
-                       return preg_replace( '/(\S+)"(\S)/', '\1\\"\2', $text );
+                       return preg_replace( '/(\S+)"(\S)/u', '\1\\"\2', $text 
);
                }
                return $text;
        }
@@ -69,7 +69,7 @@
                                \^|     (?# no user supplied boosts at this 
point, though I cant think why)
                                :|              (?# no specifying your own 
fields)
                                \\\(?!") (?# the only acceptable escaping is 
for quotes)
-                       )/x', '\\\$1', $string );
+                       )/ux', '\\\$1', $string );
                // Forward slash escaping doesn't work properly in all 
environments so we just eat them.   Nom.
                $string = str_replace( '/', ' ', $string );
 
@@ -89,30 +89,30 @@
                // Be careful when editing this method because the ordering of 
the replacements matters.
 
                // Escape ~ that don't follow a term or a quote
-               $string = preg_replace_callback( '/(?<![\w"])~/',
+               $string = preg_replace_callback( '/(?<![\w"])~/u',
                        'CirrusSearch\Search\Escaper::escapeBadSyntax', $string 
);
 
                // Remove ? and * that don't follow a term.  These are slow so 
we turned them off and escaping isn't working....
-               $string = preg_replace( '/(?<![\w])([?*])/', '', $string );
+               $string = preg_replace( '/(?<![\w])([?*])/u', '', $string );
 
                // Reduce token ranges to bare tokens without the < or >
-               $string = preg_replace( '/(?:<|>)([^\s])/', '$1', $string );
+               $string = preg_replace( '/(?:<|>)([^\s])/u', '$1', $string );
 
                // Turn bad fuzzy searches into searches that contain a ~ and 
set $this->fuzzyQuery for good ones.
                $fuzzyQuery = false;
-               $string = preg_replace_callback( 
'/(?<leading>\w)~(?<trailing>\S*)/',
+               $string = preg_replace_callback( 
'/(?<leading>\w)~(?<trailing>\S*)/u',
                        function ( $matches ) use ( &$fuzzyQuery ) {
-                               if ( preg_match( 
'/^(?:|0|(?:0?\.[0-9]+)|(?:1(?:\.0)?))$/', $matches[ 'trailing' ] ) ) {
+                               if ( preg_match( 
'/^(?:|0|(?:0?\.[0-9]+)|(?:1(?:\.0)?))$/u', $matches[ 'trailing' ] ) ) {
                                        $fuzzyQuery = true;
                                        return $matches[ 0 ];
                                } else {
                                        return $matches[ 'leading' ] . '\\~' .
-                                               preg_replace( '/(?<!\\\\)~/', 
'\~', $matches[ 'trailing' ] );
+                                               preg_replace( '/(?<!\\\\)~/u', 
'\~', $matches[ 'trailing' ] );
                                }
                        }, $string );
 
                // Turn bad proximity searches into searches that contain a ~
-               $string = preg_replace_callback( '/"~(?<trailing>\S*)/', 
function ( $matches ) {
+               $string = preg_replace_callback( '/"~(?<trailing>\S*)/u', 
function ( $matches ) {
                        if ( preg_match( '/[0-9]+/', $matches[ 'trailing' ] ) ) 
{
                                return $matches[ 0 ];
                        } else {
@@ -123,22 +123,22 @@
                // Escape +, -, and ! when not immediately followed by a term 
or when immediately
                // prefixed with a term.  Catches "foo-bar", "foo- bar", "foo - 
bar".  The only
                // acceptable use is "foo -bar" and "-bar foo".
-               $string = preg_replace_callback( '/[+\-!]+(?!\w)/',
+               $string = preg_replace_callback( '/[+\-!]+(?!\w)/u',
                        'CirrusSearch\Search\Escaper::escapeBadSyntax', $string 
);
-               $string = preg_replace_callback( '/(?<!^|[ \\\\])[+\-!]+/',
+               $string = preg_replace_callback( '/(?<!^|[ \\\\])[+\-!]+/u',
                        'CirrusSearch\Search\Escaper::escapeBadSyntax', $string 
);
 
                // Escape || when not between terms
-               $string = preg_replace_callback( '/^\s*\|\|/',
+               $string = preg_replace_callback( '/^\s*\|\|/u',
                        'CirrusSearch\Search\Escaper::escapeBadSyntax', $string 
);
-               $string = preg_replace_callback( '/\|\|\s*$/',
+               $string = preg_replace_callback( '/\|\|\s*$/u',
                        'CirrusSearch\Search\Escaper::escapeBadSyntax', $string 
);
 
                // Lowercase AND and OR when not surrounded on both sides by a 
term.
                // Lowercase NOT when it doesn't have a term after it.
-               $string = preg_replace_callback( '/^\s*(?:AND|OR)/',
+               $string = preg_replace_callback( '/^\s*(?:AND|OR)/u',
                        'CirrusSearch\Search\Escaper::lowercaseMatched', 
$string );
-               $string = preg_replace_callback( '/(?:AND|OR|NOT)\s*$/',
+               $string = preg_replace_callback( '/(?:AND|OR|NOT)\s*$/u',
                        'CirrusSearch\Search\Escaper::lowercaseMatched', 
$string );
 
                return array( $string, $fuzzyQuery );
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 4c0020e..3926ce4 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -567,8 +567,10 @@
                // Find prefix matches and force them to only match against the 
plain analyzed fields.  This
                // prevents prefix matches from getting confused by stemming.  
Users really don't expect stemming
                // in prefix queries.
-               $query = self::replaceAllPartsOfQuery( $query, 
'/\w+\*(?:\w*\*?)*/',
+               wfDebugLog( 'CirrusSearch', "asdfsdaf " . implode( explode( 
"\n", var_export( $query, true ) ) ) );
+               $query = self::replaceAllPartsOfQuery( $query, 
'/\w+\*(?:\w*\*?)*/u',
                        function ( $matches ) use ( $searcher, $escaper ) {
+                               wfDebugLog( 'CirrusSearch', "asdfsdaf " . 
implode( explode( "\n", var_export( $matches, true ) ) ) );
                                $term = $escaper->fixupQueryStringPart( 
$matches[ 0 ][ 0 ] );
                                return array(
                                        'escaped' => 
$searcher->switchSearchToExact( $term, false ),
diff --git a/tests/jenkins/Jenkins.php b/tests/jenkins/Jenkins.php
index 390dc96..458670e 100644
--- a/tests/jenkins/Jenkins.php
+++ b/tests/jenkins/Jenkins.php
@@ -124,7 +124,7 @@
         */
        public static function setLanguage( $title, &$pageLang, $wgLang ) {
                $matches = array();
-               if ( preg_match( '/\/..$/', $title->getText(), $matches ) ) {
+               if ( preg_match( '/\/..$/u', $title->getText(), $matches ) ) {
                        $pageLang = substr( $matches[ 0 ], 1 );
                }
                return true;

-- 
To view, visit https://gerrit.wikimedia.org/r/156699
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I113f72772938fbb649e2773b8a1832bd12e4eb56
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Unicode regexes everywhere! - change (mediawiki...CirrusSearch)

Reply via email to