Manybubbles has uploaded a new change for review.
https://gerrit.wikimedia.org/r/156699
Change subject: Unicode regexes everywhere!
......................................................................
Unicode regexes everywhere!
Well, almost everywhere.
Bug: 69766
Change-Id: I113f72772938fbb649e2773b8a1832bd12e4eb56
---
M includes/BuildDocument/PageDataBuilder.php
M includes/BuildDocument/PageTextBuilder.php
M includes/ElasticsearchIntermediary.php
M includes/Sanity/Checker.php
M includes/Search/Escaper.php
M includes/Searcher.php
M tests/jenkins/Jenkins.php
7 files changed, 25 insertions(+), 23 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/99/156699/1
diff --git a/includes/BuildDocument/PageDataBuilder.php
b/includes/BuildDocument/PageDataBuilder.php
index 26409d8..e5ca63a 100644
--- a/includes/BuildDocument/PageDataBuilder.php
+++ b/includes/BuildDocument/PageDataBuilder.php
@@ -105,8 +105,8 @@
// Some wikis wrap the brackets in a span:
//
http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
- $heading = preg_replace( '/<\/?span>/', '', $heading );
- $heading = preg_replace(
'/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/', '', $heading );
+ $heading = preg_replace( '/<\/?span>/u', '', $heading );
+ $heading = preg_replace(
'/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/u', '', $heading );
// Strip tags from the heading or else we'll display
them (escaped) in search results
$heading = trim( Sanitizer::stripAllTags( $heading ) );
@@ -127,7 +127,7 @@
$ignoredHeadings = array();
if( !$source->isDisabled() ) {
$lines = explode( "\n", $source->plain() );
- $lines = preg_replace( '/#.*$/', '', $lines );
// Remove comments
+ $lines = preg_replace( '/#.*$/u', '', $lines );
// Remove comments
$lines = array_map( 'trim', $lines );
// Remove extra spaces
$lines = array_filter( $lines );
// Remove empty lines
$ignoredHeadings = $lines; // Now
we just have headings!
diff --git a/includes/BuildDocument/PageTextBuilder.php
b/includes/BuildDocument/PageTextBuilder.php
index 347f96b..8f71315 100644
--- a/includes/BuildDocument/PageTextBuilder.php
+++ b/includes/BuildDocument/PageTextBuilder.php
@@ -135,7 +135,7 @@
private function extractHeadingBeforeFirstHeading( $text ) {
$matches = array();
- if ( !preg_match( '/<h[123456]>/', $text, $matches,
PREG_OFFSET_CAPTURE ) ) {
+ if ( !preg_match( '/<h[123456]>/u', $text, $matches,
PREG_OFFSET_CAPTURE ) ) {
// There isn't a first heading so we interpret this as
the article
// being entirely without heading.
return null;
diff --git a/includes/ElasticsearchIntermediary.php
b/includes/ElasticsearchIntermediary.php
index 2d12671..6896922 100644
--- a/includes/ElasticsearchIntermediary.php
+++ b/includes/ElasticsearchIntermediary.php
@@ -196,7 +196,7 @@
$errorMessage = 'unknown';
$position = 'unknown';
$matches = array();
- if ( preg_match( '/(.+) at position ([0-9]+)/',
$syntaxError, $matches ) ) {
+ if ( preg_match( '/(.+) at position ([0-9]+)/u',
$syntaxError, $matches ) ) {
$errorMessage = $matches[ 1 ];
// The 3 below offsets the .*( in front of the
user pattern to make it unanchored.
$position = $matches[ 2 ] - 3;
diff --git a/includes/Sanity/Checker.php b/includes/Sanity/Checker.php
index e45b854..c360148 100644
--- a/includes/Sanity/Checker.php
+++ b/includes/Sanity/Checker.php
@@ -73,7 +73,7 @@
$expectedType =
Connection::getIndexSuffixForNamespace( $page->getTitle()->getNamespace() );
foreach ( $fromIndex as $indexInfo ) {
$matches = array();
- if ( !preg_match(
'/_(.+)_.+$/', $indexInfo->getIndex(), $matches ) ) {
+ if ( !preg_match(
'/_(.+)_.+$/u', $indexInfo->getIndex(), $matches ) ) {
return
Status::newFatal( "Can't parse index name: " . $indexInfo->getIndex() );
}
$type = $matches[ 1 ];
diff --git a/includes/Search/Escaper.php b/includes/Search/Escaper.php
index f7ec1e5..9615d9e 100644
--- a/includes/Search/Escaper.php
+++ b/includes/Search/Escaper.php
@@ -37,7 +37,7 @@
// character (״), called a Gershayim, which mark
acronyms. Here we guess if the intent
// was to mark a phrase, in which case we leave the
quotes alone, or to mark an
// acronym, in which case we escape them.
- return preg_replace( '/(\S+)"(\S)/', '\1\\"\2', $text );
+ return preg_replace( '/(\S+)"(\S)/u', '\1\\"\2', $text
);
}
return $text;
}
@@ -69,7 +69,7 @@
\^| (?# no user supplied boosts at this
point, though I cant think why)
:| (?# no specifying your own
fields)
\\\(?!") (?# the only acceptable escaping is
for quotes)
- )/x', '\\\$1', $string );
+ )/ux', '\\\$1', $string );
// Forward slash escaping doesn't work properly in all
environments so we just eat them. Nom.
$string = str_replace( '/', ' ', $string );
@@ -89,30 +89,30 @@
// Be careful when editing this method because the ordering of
the replacements matters.
// Escape ~ that don't follow a term or a quote
- $string = preg_replace_callback( '/(?<![\w"])~/',
+ $string = preg_replace_callback( '/(?<![\w"])~/u',
'CirrusSearch\Search\Escaper::escapeBadSyntax', $string
);
// Remove ? and * that don't follow a term. These are slow so
we turned them off and escaping isn't working....
- $string = preg_replace( '/(?<![\w])([?*])/', '', $string );
+ $string = preg_replace( '/(?<![\w])([?*])/u', '', $string );
// Reduce token ranges to bare tokens without the < or >
- $string = preg_replace( '/(?:<|>)([^\s])/', '$1', $string );
+ $string = preg_replace( '/(?:<|>)([^\s])/u', '$1', $string );
// Turn bad fuzzy searches into searches that contain a ~ and
set $this->fuzzyQuery for good ones.
$fuzzyQuery = false;
- $string = preg_replace_callback(
'/(?<leading>\w)~(?<trailing>\S*)/',
+ $string = preg_replace_callback(
'/(?<leading>\w)~(?<trailing>\S*)/u',
function ( $matches ) use ( &$fuzzyQuery ) {
- if ( preg_match(
'/^(?:|0|(?:0?\.[0-9]+)|(?:1(?:\.0)?))$/', $matches[ 'trailing' ] ) ) {
+ if ( preg_match(
'/^(?:|0|(?:0?\.[0-9]+)|(?:1(?:\.0)?))$/u', $matches[ 'trailing' ] ) ) {
$fuzzyQuery = true;
return $matches[ 0 ];
} else {
return $matches[ 'leading' ] . '\\~' .
- preg_replace( '/(?<!\\\\)~/',
'\~', $matches[ 'trailing' ] );
+ preg_replace( '/(?<!\\\\)~/u',
'\~', $matches[ 'trailing' ] );
}
}, $string );
// Turn bad proximity searches into searches that contain a ~
- $string = preg_replace_callback( '/"~(?<trailing>\S*)/',
function ( $matches ) {
+ $string = preg_replace_callback( '/"~(?<trailing>\S*)/u',
function ( $matches ) {
if ( preg_match( '/[0-9]+/', $matches[ 'trailing' ] ) )
{
return $matches[ 0 ];
} else {
@@ -123,22 +123,22 @@
// Escape +, -, and ! when not immediately followed by a term
or when immediately
// prefixed with a term. Catches "foo-bar", "foo- bar", "foo -
bar". The only
// acceptable use is "foo -bar" and "-bar foo".
- $string = preg_replace_callback( '/[+\-!]+(?!\w)/',
+ $string = preg_replace_callback( '/[+\-!]+(?!\w)/u',
'CirrusSearch\Search\Escaper::escapeBadSyntax', $string
);
- $string = preg_replace_callback( '/(?<!^|[ \\\\])[+\-!]+/',
+ $string = preg_replace_callback( '/(?<!^|[ \\\\])[+\-!]+/u',
'CirrusSearch\Search\Escaper::escapeBadSyntax', $string
);
// Escape || when not between terms
- $string = preg_replace_callback( '/^\s*\|\|/',
+ $string = preg_replace_callback( '/^\s*\|\|/u',
'CirrusSearch\Search\Escaper::escapeBadSyntax', $string
);
- $string = preg_replace_callback( '/\|\|\s*$/',
+ $string = preg_replace_callback( '/\|\|\s*$/u',
'CirrusSearch\Search\Escaper::escapeBadSyntax', $string
);
// Lowercase AND and OR when not surrounded on both sides by a
term.
// Lowercase NOT when it doesn't have a term after it.
- $string = preg_replace_callback( '/^\s*(?:AND|OR)/',
+ $string = preg_replace_callback( '/^\s*(?:AND|OR)/u',
'CirrusSearch\Search\Escaper::lowercaseMatched',
$string );
- $string = preg_replace_callback( '/(?:AND|OR|NOT)\s*$/',
+ $string = preg_replace_callback( '/(?:AND|OR|NOT)\s*$/u',
'CirrusSearch\Search\Escaper::lowercaseMatched',
$string );
return array( $string, $fuzzyQuery );
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 4c0020e..3926ce4 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -567,8 +567,10 @@
// Find prefix matches and force them to only match against the
plain analyzed fields. This
// prevents prefix matches from getting confused by stemming.
Users really don't expect stemming
// in prefix queries.
- $query = self::replaceAllPartsOfQuery( $query,
'/\w+\*(?:\w*\*?)*/',
+ wfDebugLog( 'CirrusSearch', "asdfsdaf " . implode( explode(
"\n", var_export( $query, true ) ) ) );
+ $query = self::replaceAllPartsOfQuery( $query,
'/\w+\*(?:\w*\*?)*/u',
function ( $matches ) use ( $searcher, $escaper ) {
+ wfDebugLog( 'CirrusSearch', "asdfsdaf " .
implode( explode( "\n", var_export( $matches, true ) ) ) );
$term = $escaper->fixupQueryStringPart(
$matches[ 0 ][ 0 ] );
return array(
'escaped' =>
$searcher->switchSearchToExact( $term, false ),
diff --git a/tests/jenkins/Jenkins.php b/tests/jenkins/Jenkins.php
index 390dc96..458670e 100644
--- a/tests/jenkins/Jenkins.php
+++ b/tests/jenkins/Jenkins.php
@@ -124,7 +124,7 @@
*/
public static function setLanguage( $title, &$pageLang, $wgLang ) {
$matches = array();
- if ( preg_match( '/\/..$/', $title->getText(), $matches ) ) {
+ if ( preg_match( '/\/..$/u', $title->getText(), $matches ) ) {
$pageLang = substr( $matches[ 0 ], 1 );
}
return true;
--
To view, visit https://gerrit.wikimedia.org/r/156699
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I113f72772938fbb649e2773b8a1832bd12e4eb56
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits