Anomie has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/322729

Change subject: Use new externallinks.el_index_60 field
......................................................................

Use new externallinks.el_index_60 field

This adds a method to LinkFilter to build the query conditions necessary
to properly use it, and adjusts code to use it.

This also takes the opportunity to clean up the calculation of el_index:
IPs are handled more sensibly and IDNs are canonicalized.

Bug: T59176
Bug: T130482
Change-Id: I84d224ef23de22dfe179009ec3a11fd0e4b5f56d
---
M autoload.php
M includes/GlobalFunctions.php
M includes/LinkFilter.php
M includes/api/ApiQueryBase.php
M includes/api/ApiQueryExtLinksUsage.php
M includes/api/ApiQueryExternalLinks.php
M includes/deferred/LinksUpdate.php
M includes/installer/DatabaseUpdater.php
M includes/parser/Parser.php
M includes/specials/SpecialLinkSearch.php
M maintenance/cleanupSpam.php
M maintenance/deleteSelfExternals.php
A maintenance/refreshExternallinksIndex.php
M tests/phpunit/includes/GlobalFunctions/GlobalTest.php
M tests/phpunit/includes/LinkFilterTest.php
M tests/phpunit/includes/parser/ParserMethodsTest.php
16 files changed, 705 insertions(+), 225 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/29/322729/1

diff --git a/autoload.php b/autoload.php
index f9cb837..bad2635 100644
--- a/autoload.php
+++ b/autoload.php
@@ -1159,6 +1159,7 @@
        'RedisConnectionPool' => __DIR__ . 
'/includes/libs/redis/RedisConnectionPool.php',
        'RedisLockManager' => __DIR__ . 
'/includes/libs/lockmanager/RedisLockManager.php',
        'RedisPubSubFeedEngine' => __DIR__ . 
'/includes/rcfeed/RedisPubSubFeedEngine.php',
+       'RefreshExternallinksIndex' => __DIR__ . 
'/maintenance/refreshExternallinksIndex.php',
        'RefreshFileHeaders' => __DIR__ . '/maintenance/refreshFileHeaders.php',
        'RefreshImageMetadata' => __DIR__ . 
'/maintenance/refreshImageMetadata.php',
        'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php',
diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php
index b3ccc56..ba7ff78 100644
--- a/includes/GlobalFunctions.php
+++ b/includes/GlobalFunctions.php
@@ -904,55 +904,13 @@
 /**
  * Make URL indexes, appropriate for the el_index field of externallinks.
  *
+ * @deprecated since 1.29, use LinkFilter::makeIndexes() instead
  * @param string $url
  * @return array
  */
 function wfMakeUrlIndexes( $url ) {
-       $bits = wfParseUrl( $url );
-
-       // Reverse the labels in the hostname, convert to lower case
-       // For emails reverse domainpart only
-       if ( $bits['scheme'] == 'mailto' ) {
-               $mailparts = explode( '@', $bits['host'], 2 );
-               if ( count( $mailparts ) === 2 ) {
-                       $domainpart = strtolower( implode( '.', array_reverse( 
explode( '.', $mailparts[1] ) ) ) );
-               } else {
-                       // No domain specified, don't mangle it
-                       $domainpart = '';
-               }
-               $reversedHost = $domainpart . '@' . $mailparts[0];
-       } else {
-               $reversedHost = strtolower( implode( '.', array_reverse( 
explode( '.', $bits['host'] ) ) ) );
-       }
-       // Add an extra dot to the end
-       // Why? Is it in wrong place in mailto links?
-       if ( substr( $reversedHost, -1, 1 ) !== '.' ) {
-               $reversedHost .= '.';
-       }
-       // Reconstruct the pseudo-URL
-       $prot = $bits['scheme'];
-       $index = $prot . $bits['delimiter'] . $reversedHost;
-       // Leave out user and password. Add the port, path, query and fragment
-       if ( isset( $bits['port'] ) ) {
-               $index .= ':' . $bits['port'];
-       }
-       if ( isset( $bits['path'] ) ) {
-               $index .= $bits['path'];
-       } else {
-               $index .= '/';
-       }
-       if ( isset( $bits['query'] ) ) {
-               $index .= '?' . $bits['query'];
-       }
-       if ( isset( $bits['fragment'] ) ) {
-               $index .= '#' . $bits['fragment'];
-       }
-
-       if ( $prot == '' ) {
-               return [ "http:$index", "https:$index" ];
-       } else {
-               return [ $index ];
-       }
+       wfDeprecated( __FUNCTION__, '1.29' );
+       return LinkFilter::makeIndexes( $url );
 }
 
 /**
diff --git a/includes/LinkFilter.php b/includes/LinkFilter.php
index 7b3d72b..a3496b6 100644
--- a/includes/LinkFilter.php
+++ b/includes/LinkFilter.php
@@ -31,6 +31,11 @@
  * Another cool thing to do would be a web interface for fast spam removal.
  */
 class LinkFilter {
+       /**
+        * Increment this when makeIndexes output changes. It'll cause
+        * maintenance/refreshExternallinksIndex.php to run from update.php.
+        */
+       const VERSION = 1;
 
        /**
         * Check whether $content contains a link to $filterEntry
@@ -56,6 +61,7 @@
        /**
         * Builds a regex pattern for $filterEntry.
         *
+        * @todo This doesn't match the rest of it.
         * @param string $filterEntry URL, if it begins with "*.", it'll be
         *        replaced to match any subdomain
         * @return string Regex pattern, for preg_match()
@@ -68,6 +74,200 @@
                }
                $regex .= preg_quote( $filterEntry, '!' ) . '!Si';
                return $regex;
+       }
+
+       /**
+        * Canonicalize a hostname for el_index.
+        *
+        * The host is percent-decoded, passed through idn_to_utf8(), lower-cased,
+        * and every character outside the allowed set is percent-encoded again.
+        * The result is then turned into one of three forms:
+        *  - Bracketed IPv6 address (optionally ending in ":*"): "V6." followed
+        *    by a string built from IP::sanitizeIP() output with ':' replaced by '.'.
+        *  - Dotted IPv4 address (optionally ending in ".*"): "V4." followed by
+        *    the octets cast to integers, which drops leading zeros.
+        *  - Anything else: treated as a host name, with its labels reversed.
+        * Every form ends with a trailing dot.
+        *
+        * @param string $host Host name or IP address, possibly containing a "*" wildcard
+        * @return string
+        */
+       private static function indexifyHost( $host ) {
+               // Canonicalize
+               $host = preg_replace_callback(
+                       '<[^a-zA-Z0-9\\-._~!$&\'()*+,;=]>',
+                       function ( $m ) {
+                               return rawurlencode( $m[0] );
+                       },
+                       strtolower( idn_to_utf8( rawurldecode( $host ) ) )
+               );
+
+               // IPv6? RFC 3986 syntax.
+               if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host 
), $m ) ) {
+                       $ip = $m[1];
+                       if ( IP::isValid( $ip ) ) {
+                               return 'V6.' . implode( '.', explode( ':', 
IP::sanitizeIP( $ip ) ) ) . '.';
+                       }
+                       if ( substr( $ip, -2 ) === ':*' ) {
+                               $cutIp = substr( $ip, 0, -2 );
+                               if ( IP::isValid( "{$cutIp}::" ) ) {
+                                       $ct = count( explode( ':', $ip ) ) - 1;
+                                       return 'V6.' .
+                                               implode( '.', array_slice( 
explode( ':', IP::sanitizeIP( "{$cutIp}::" ) ), 0, $ct ) ) .
+                                               '.*.';
+                               }
+                               if ( IP::isValid( "{$cutIp}:1" ) ) {
+                                       return 'V6.' .
+                                               substr( implode( '.', explode( 
':', IP::sanitizeIP( "{$cutIp}:1" ) ) ), 0, -1 ) .
+                                               '*.';
+                               }
+                       }
+               }
+
+               // IPv4?
+               $b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
+               if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", 
$host ) ) {
+                       return 'V4.' . implode( '.', array_map( function ( $v ) 
{
+                               return $v === '*' ? $v : (int)$v;
+                       }, explode( '.', $host ) ) ) . '.';
+               }
+
+               // Must be a host name.
+               return implode( '.', array_reverse( explode( '.', $host ) ) ) . 
'.';
+       }
+
+       /**
+        * Converts a URL into the format stored in el_index.
+        *
+        * The host is replaced by its self::indexifyHost() form; for mailto: the
+        * domain is indexified and placed before the "@", followed by the local
+        * part. User and password are dropped, and the port, path (defaulting to
+        * "/"), query and fragment are appended unchanged.
+        *
+        * @since 1.29
+        * @param string $url URL to index; parsed with wfParseUrl()
+        * @return string[] Usually one entry, but two ("http:" and "https:"
+        *  variants) when the parsed scheme is empty, i.e. for protocol-relative
+        *  URLs. Empty array when wfParseUrl() fails.
+        */
+       public static function makeIndexes( $url ) {
+               $bits = wfParseUrl( $url );
+               if ( !$bits ) {
+                       return [];
+               }
+
+               // Reverse the labels in the hostname, convert to lower case, 
unless it's an IP.
+               // For emails turn it into "domain.reversed@localpart"
+               if ( $bits['scheme'] == 'mailto' ) {
+                       $mailparts = explode( '@', $bits['host'], 2 );
+                       if ( count( $mailparts ) === 2 ) {
+                               $domainpart = self::indexifyHost( $mailparts[1] 
);
+                       } else {
+                               // No @, assume it's a local part with no domain
+                               $domainpart = '';
+                       }
+                       $bits['host'] = $domainpart . '@' . $mailparts[0];
+               } else {
+                       $bits['host'] = self::indexifyHost( $bits['host'] );
+               }
+
+               // Reconstruct the pseudo-URL
+               $index = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
+               // Leave out user and password. Add the port, path, query and 
fragment
+               if ( isset( $bits['port'] ) ) {
+                       $index .= ':' . $bits['port'];
+               }
+               if ( isset( $bits['path'] ) ) {
+                       $index .= $bits['path'];
+               } else {
+                       $index .= '/';
+               }
+               if ( isset( $bits['query'] ) ) {
+                       $index .= '?' . $bits['query'];
+               }
+               if ( isset( $bits['fragment'] ) ) {
+                       $index .= '#' . $bits['fragment'];
+               }
+
+               if ( $bits['scheme'] == '' ) {
+                       return [ "http:$index", "https:$index" ];
+               } else {
+                       return [ $index ];
+               }
+       }
+
+       /**
+        * Return query conditions which will match the specified string. There are
+        * several kinds of filter entry:
+        *
+        *     *.domain.com    -  Matches domain.com and www.domain.com
+        *     domain.com      -  Matches domain.com or domain.com/ but not www.domain.com
+        *     *.domain.com/x  -  Matches domain.com/xy or www.domain.com/xy. Also probably matches
+        *                        domain.com/foobar/xy due to limitations of LIKE syntax.
+        *     domain.com/x    -  Matches domain.com/xy but not www.domain.com/xy
+        *
+        * Asterisks in any other location are considered invalid.
+        *
+        * @since 1.29
+        * @param string $filterEntry Filter entry, as described above
+        * @param array $options Options are:
+        *   - protocol: (string) Protocol to query (default http://)
+        *   - oneWildcard: (bool) Stop at the first wildcard (default false)
+        *   - prefix: (string) Field prefix (default 'el'). The query will test
+        *     fields '{$prefix}_index' and '{$prefix}_index_60'
+        *   - db: (IDatabase|null) Database to use. Defaults to wfGetDB( DB_REPLICA ).
+        * @return array|bool Conditions to be used for the query (to be ANDed) or
+        *  false on error. To determine if the query is constant on the
+        *  '{$prefix}_index_60' field, check whether that key is set in the result.
+        */
+       public static function getQueryConditions( $filterEntry, array $options = [] ) {
+               $options += [
+                       'protocol' => 'http://',
+                       'oneWildcard' => false,
+                       'prefix' => 'el',
+                       'db' => null,
+               ];
+
+               // First, get the like array
+               $like = self::makeLikeArray( $filterEntry, $options['protocol'] );
+               if ( $like === false ) {
+                       return $like;
+               }
+
+               // Get the constant prefix (i.e. everything up to the first wildcard)
+               $trimmedLike = self::keepOneWildcard( $like );
+               if ( $options['oneWildcard'] ) {
+                       $like = $trimmedLike;
+               }
+               if ( $trimmedLike[count( $trimmedLike ) - 1] instanceof LikeMatch ) {
+                       array_pop( $trimmedLike );
+               }
+               $index = implode( '', $trimmedLike );
+
+               $p = $options['prefix'];
+               $db = $options['db'] ?: wfGetDB( DB_REPLICA );
+
+               // Build the query
+               $l = strlen( $index );
+               if ( $l >= 60 ) {
+                       // The constant prefix is larger than el_index_60, so we can use a
+                       // constant comparison.
+                       return [
+                               "{$p}_index_60" => substr( $index, 0, 60 ),
+                               "{$p}_index" . $db->buildLike( $like ),
+                       ];
+               }
+
+               // The constant prefix is smaller than el_index_60, so we make a range query on
+               // "$index <= el_index_60 < $index+1", where $index+1 is calculated
+               // by treating $index as a big-endian base-256 number.
+               $indexEnd = $index;
+               while ( --$l >= 0 ) {
+                       if ( $indexEnd[$l] === "\xff" ) {
+                               // FF + 1 = 00 with a carry: zero this byte (an assignment, not a
+                               // comparison) and continue with the next more significant byte.
+                               $indexEnd[$l] = "\x00";
+                       } else {
+                               // No carry, stop here.
+                               $indexEnd[$l] = chr( ord( $indexEnd[$l] ) + 1 );
+                               break;
+                       }
+               }
+
+               // Overflow? Every byte carried (or $index was empty), so there is no
+               // finite upper bound; only the lower bound can be used.
+               if ( $l < 0 ) {
+                       return [
+                               "{$p}_index_60 >= " . $db->addQuotes( $index ),
+                               "{$p}_index" . $db->buildLike( $like ),
+                       ];
+               }
+
+               return [
+                       "{$p}_index_60 >= " . $db->addQuotes( $index ),
+                       "{$p}_index_60 < " . $db->addQuotes( $indexEnd ),
+                       "{$p}_index" . $db->buildLike( $like ),
+               ];
        }
 
        /**
@@ -87,6 +287,7 @@
         * This function does the same as wfMakeUrlIndexes(), except it also 
takes care
         * of adding wildcards
         *
+        * @note You probably want self::getQueryConditions() instead
         * @param string $filterEntry Domainparts
         * @param string $protocol Protocol (default http://)
         * @return array|bool Array to be passed to Database::buildLike() or 
false on error
@@ -96,38 +297,26 @@
 
                $target = $protocol . $filterEntry;
                $bits = wfParseUrl( $target );
-
-               if ( $bits == false ) {
-                       // Unknown protocol?
+               if ( !$bits ) {
                        return false;
                }
 
-               if ( substr( $bits['host'], 0, 2 ) == '*.' ) {
-                       $subdomains = true;
-                       $bits['host'] = substr( $bits['host'], 2 );
-                       if ( $bits['host'] == '' ) {
-                               // We don't want to make a clause that will 
match everything,
-                               // that could be dangerous
-                               return false;
+               $subdomains = false;
+               if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' 
) ) {
+                       // Email address with domain and non-empty local part
+                       $mailparts = explode( '@', $bits['host'], 2 );
+                       if ( $mailparts[0] === '*' ) {
+                               $subdomains = true;
+                               $bits['host'] = self::indexifyHost( 
$mailparts[1] ) . '@';
+                       } else {
+                               $bits['host'] = self::indexifyHost( 
$mailparts[1] ) . '@' . $mailparts[0];
                        }
                } else {
-                       $subdomains = false;
-               }
-
-               // Reverse the labels in the hostname, convert to lower case
-               // For emails reverse domainpart only
-               if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' 
) ) {
-                       // complete email address
-                       $mailparts = explode( '@', $bits['host'] );
-                       $domainpart = strtolower( implode( '.', array_reverse( 
explode( '.', $mailparts[1] ) ) ) );
-                       $bits['host'] = $domainpart . '@' . $mailparts[0];
-               } elseif ( $bits['scheme'] === 'mailto' ) {
-                       // domainpart of email address only, do not add '.'
-                       $bits['host'] = strtolower( implode( '.', 
array_reverse( explode( '.', $bits['host'] ) ) ) );
-               } else {
-                       $bits['host'] = strtolower( implode( '.', 
array_reverse( explode( '.', $bits['host'] ) ) ) );
-                       if ( substr( $bits['host'], -1, 1 ) !== '.' ) {
-                               $bits['host'] .= '.';
+                       // Non-email, or email with only a domain part.
+                       $bits['host'] = self::indexifyHost( $bits['host'] );
+                       if ( substr( $bits['host'], -3 ) === '.*.' ) {
+                               $subdomains = true;
+                               $bits['host'] = substr( $bits['host'], 0, -2 );
                        }
                }
 
@@ -171,6 +360,7 @@
         * Filters an array returned by makeLikeArray(), removing everything 
past first
         * pattern placeholder.
         *
+        * @note You probably want self::getQueryConditions() instead
         * @param array $arr Array to filter
         * @return array Filtered array
         */
diff --git a/includes/api/ApiQueryBase.php b/includes/api/ApiQueryBase.php
index bba5375..6fcf8cf 100644
--- a/includes/api/ApiQueryBase.php
+++ b/includes/api/ApiQueryBase.php
@@ -408,13 +408,15 @@
        }
 
        /**
+        * @deprecated since 1.29, use LinkFilter::getQueryConditions() instead
         * @param string $query
         * @param string $protocol
         * @return null|string
         */
        public function prepareUrlQuerySearchString( $query = null, $protocol = 
null ) {
+               wfDeprecated( __METHOD__, '1.29' );
                $db = $this->getDB();
-               if ( !is_null( $query ) || $query != '' ) {
+               if ( $query !== null && $query !== '' ) {
                        if ( is_null( $protocol ) ) {
                                $protocol = 'http://';
                        }
diff --git a/includes/api/ApiQueryExtLinksUsage.php 
b/includes/api/ApiQueryExtLinksUsage.php
index 9b05537..bcddfe9 100644
--- a/includes/api/ApiQueryExtLinksUsage.php
+++ b/includes/api/ApiQueryExtLinksUsage.php
@@ -51,12 +51,12 @@
         */
        private function run( $resultPageSet = null ) {
                $params = $this->extractRequestParams();
+               $db = $this->getDB();
 
                $query = $params['query'];
                $protocol = self::getProtocolPrefix( $params['protocol'] );
 
-               $this->addTables( [ 'page', 'externallinks' ] ); // must be in 
this order for 'USE INDEX'
-               $this->addOption( 'USE INDEX', 'el_index' );
+               $this->addTables( [ 'page', 'externallinks' ] );
                $this->addWhere( 'page_id=el_from' );
 
                $miser_ns = [];
@@ -69,11 +69,38 @@
                // Normalize query to match the normalization applied for the 
externallinks table
                $query = Parser::normalizeLinkUrl( $query );
 
-               $whereQuery = $this->prepareUrlQuerySearchString( $query, 
$protocol );
+               $orderBy = [];
 
-               if ( $whereQuery !== null ) {
-                       $this->addWhere( $whereQuery );
+               if ( $query !== null && $query !== '' ) {
+                       if ( $protocol === null ) {
+                               $protocol = 'http://';
+                       }
+                       $conds = LinkFilter::getQueryConditions( $query,
+                               [ 'protocol' => $protocol, 'oneWildcard' => 
true, 'db' => $db ] );
+                       if ( !$conds ) {
+                                $this->dieUsage( 'Invalid query', 'bad_query' 
);
+                       }
+                       $this->addWhere( $conds );
+                       if ( !isset( $conds['el_index_60'] ) ) {
+                               $orderBy[] = 'el_index_60';
+                       }
+               } else {
+                       $orderBy[] = 'el_index_60';
+
+                       if ( $protocol !== null ) {
+                               $this->addWhere( 'el_index_60' . 
$db->buildLike( "$protocol", $db->anyString() ) );
+                       } else {
+                               // We're querying all protocols, filter out 
duplicate protocol-relative links
+                               $this->addWhere( $db->makeList( [
+                                       'el_to NOT' . $db->buildLike( '//', 
$db->anyString() ),
+                                       'el_index_60 ' . $db->buildLike( 
'http://', $db->anyString() ),
+                               ], LIST_OR ) );
+                       }
                }
+
+               $orderBy[] = 'el_id';
+               $this->addOption( 'ORDER BY', $orderBy );
+               $this->addFields( $orderBy ); // Make sure
 
                $prop = array_flip( $params['prop'] );
                $fld_ids = isset( $prop['ids'] );
@@ -92,10 +119,19 @@
                }
 
                $limit = $params['limit'];
-               $offset = $params['offset'];
                $this->addOption( 'LIMIT', $limit + 1 );
-               if ( isset( $offset ) ) {
-                       $this->addOption( 'OFFSET', $offset );
+
+               if ( $params['continue'] !== null ) {
+                       $cont = explode( '|', $params['continue'] );
+                       $this->dieContinueUsageIf( count( $cont ) !== count( 
$orderBy ) );
+                       $i = count( $cont ) - 1;
+                       $cond = $orderBy[$i] . ' >= ' . $db->addQuotes( 
rawurldecode( $cont[$i] ) );
+                       while ( $i-- > 0 ) {
+                               $field = $orderBy[$i];
+                               $v = $db->addQuotes( rawurldecode( $cont[$i] ) 
);
+                               $cond = "($field > $v OR ($field = $v AND 
$cond))";
+                       }
+                       $this->addWhere( $cond );
                }
 
                $res = $this->select( __METHOD__ );
@@ -106,7 +142,7 @@
                        if ( ++$count > $limit ) {
                                // We've reached the one extra which shows that 
there are
                                // additional pages to be had. Stop here...
-                               $this->setContinueEnumParameter( 'offset', 
$offset + $limit );
+                               $this->setContinue( $orderBy, $row );
                                break;
                        }
 
@@ -135,7 +171,7 @@
                                }
                                $fit = $result->addValue( [ 'query', 
$this->getModuleName() ], null, $vals );
                                if ( !$fit ) {
-                                       $this->setContinueEnumParameter( 
'offset', $offset + $count - 1 );
+                                       $this->setContinue( $orderBy, $row );
                                        break;
                                }
                        } else {
@@ -147,6 +183,14 @@
                        $result->addIndexedTagName( [ 'query', 
$this->getModuleName() ],
                                $this->getModulePrefix() );
                }
+       }
+
+       /**
+        * Set the 'continue' parameter from the ORDER BY fields of a result row.
+        *
+        * Each field value is rawurlencode()d and the values are joined with '|'.
+        *
+        * @param string[] $orderBy Names of the fields the query is ordered by
+        * @param object $row Result row to read those fields from
+        */
+       private function setContinue( $orderBy, $row ) {
+               $encode = function ( $column ) use ( $row ) {
+                       return rawurlencode( $row->$column );
+               };
+               $token = implode( '|', array_map( $encode, $orderBy ) );
+               $this->setContinueEnumParameter( 'continue', $token );
        }
 
        public function getAllowedParams() {
@@ -161,8 +205,7 @@
                                ],
                                ApiBase::PARAM_HELP_MSG_PER_VALUE => [],
                        ],
-                       'offset' => [
-                               ApiBase::PARAM_TYPE => 'integer',
+                       'continue' => [
                                ApiBase::PARAM_HELP_MSG => 
'api-help-param-continue',
                        ],
                        'protocol' => [
diff --git a/includes/api/ApiQueryExternalLinks.php 
b/includes/api/ApiQueryExternalLinks.php
index 8c9c887..30d2997 100644
--- a/includes/api/ApiQueryExternalLinks.php
+++ b/includes/api/ApiQueryExternalLinks.php
@@ -41,6 +41,7 @@
                }
 
                $params = $this->extractRequestParams();
+               $db = $this->getDB();
 
                $query = $params['query'];
                $protocol = ApiQueryExtLinksUsage::getProtocolPrefix( 
$params['protocol'] );
@@ -53,26 +54,55 @@
                $this->addTables( 'externallinks' );
                $this->addWhereFld( 'el_from', array_keys( 
$this->getPageSet()->getGoodTitles() ) );
 
-               $whereQuery = $this->prepareUrlQuerySearchString( $query, 
$protocol );
-
-               if ( $whereQuery !== null ) {
-                       $this->addWhere( $whereQuery );
-               }
-
-               // Don't order by el_from if it's constant in the WHERE clause
+               $orderBy = [];
                if ( count( $this->getPageSet()->getGoodTitles() ) != 1 ) {
-                       $this->addOption( 'ORDER BY', 'el_from' );
+                       $orderBy[] = 'el_from';
                }
 
-               // If we're querying all protocols, use DISTINCT to avoid 
repeating protocol-relative links twice
-               if ( $protocol === null ) {
-                       $this->addOption( 'DISTINCT' );
+               if ( $query !== null && $query !== '' ) {
+                       if ( $protocol === null ) {
+                               $protocol = 'http://';
+                       }
+                       $conds = LinkFilter::getQueryConditions( $query,
+                               [ 'protocol' => $protocol, 'oneWildcard' => 
true, 'db' => $db ] );
+                       if ( !$conds ) {
+                                $this->dieUsage( 'Invalid query', 'bad_query' 
);
+                       }
+                       $this->addWhere( $conds );
+                       if ( !isset( $conds['el_index_60'] ) ) {
+                               $orderBy[] = 'el_index_60';
+                       }
+               } else {
+                       $orderBy[] = 'el_index_60';
+
+                       if ( $protocol !== null ) {
+                               $this->addWhere( 'el_index_60' . 
$db->buildLike( "$protocol", $db->anyString() ) );
+                       } else {
+                               // We're querying all protocols, filter out 
duplicate protocol-relative links
+                               $this->addWhere( $db->makeList( [
+                                       'el_to NOT' . $db->buildLike( '//', 
$db->anyString() ),
+                                       'el_index_60 ' . $db->buildLike( 
'http://', $db->anyString() ),
+                               ], LIST_OR ) );
+                       }
                }
+
+               $orderBy[] = 'el_id';
+               $this->addOption( 'ORDER BY', $orderBy );
+               $this->addFields( $orderBy ); // Make sure
 
                $this->addOption( 'LIMIT', $params['limit'] + 1 );
-               $offset = isset( $params['offset'] ) ? $params['offset'] : 0;
-               if ( $offset ) {
-                       $this->addOption( 'OFFSET', $params['offset'] );
+
+               if ( $params['continue'] !== null ) {
+                       $cont = explode( '|', $params['continue'] );
+                       $this->dieContinueUsageIf( count( $cont ) !== count( 
$orderBy ) );
+                       $i = count( $cont ) - 1;
+                       $cond = $orderBy[$i] . ' >= ' . $db->addQuotes( 
rawurldecode( $cont[$i] ) );
+                       while ( $i-- > 0 ) {
+                               $field = $orderBy[$i];
+                               $v = $db->addQuotes( rawurldecode( $cont[$i] ) 
);
+                               $cond = "($field > $v OR ($field = $v AND 
$cond))";
+                       }
+                       $this->addWhere( $cond );
                }
 
                $res = $this->select( __METHOD__ );
@@ -82,7 +112,7 @@
                        if ( ++$count > $params['limit'] ) {
                                // We've reached the one extra which shows that
                                // there are additional pages to be had. Stop 
here...
-                               $this->setContinueEnumParameter( 'offset', 
$offset + $params['limit'] );
+                               $this->setContinue( $orderBy, $row );
                                break;
                        }
                        $entry = [];
@@ -94,10 +124,18 @@
                        ApiResult::setContentValue( $entry, 'url', $to );
                        $fit = $this->addPageSubItem( $row->el_from, $entry );
                        if ( !$fit ) {
-                               $this->setContinueEnumParameter( 'offset', 
$offset + $count - 1 );
+                               $this->setContinue( $orderBy, $row );
                                break;
                        }
                }
+       }
+
+       /**
+        * Set the 'continue' parameter from the ORDER BY fields of a result row.
+        *
+        * Each field value is rawurlencode()d and the values are joined with '|'.
+        *
+        * @param string[] $orderBy Names of the fields the query is ordered by
+        * @param object $row Result row to read those fields from
+        */
+       private function setContinue( $orderBy, $row ) {
+               $encode = function ( $column ) use ( $row ) {
+                       return rawurlencode( $row->$column );
+               };
+               $token = implode( '|', array_map( $encode, $orderBy ) );
+               $this->setContinueEnumParameter( 'continue', $token );
        }
 
        public function getCacheMode( $params ) {
@@ -113,8 +151,7 @@
                                ApiBase::PARAM_MAX => ApiBase::LIMIT_BIG1,
                                ApiBase::PARAM_MAX2 => ApiBase::LIMIT_BIG2
                        ],
-                       'offset' => [
-                               ApiBase::PARAM_TYPE => 'integer',
+                       'continue' => [
                                ApiBase::PARAM_HELP_MSG => 
'api-help-param-continue',
                        ],
                        'protocol' => [
diff --git a/includes/deferred/LinksUpdate.php 
b/includes/deferred/LinksUpdate.php
index b7f0b7e..95908e3 100644
--- a/includes/deferred/LinksUpdate.php
+++ b/includes/deferred/LinksUpdate.php
@@ -538,7 +538,7 @@
                $arr = [];
                $diffs = array_diff_key( $this->mExternals, $existing );
                foreach ( $diffs as $url => $dummy ) {
-                       foreach ( wfMakeUrlIndexes( $url ) as $index ) {
+                       foreach ( LinkFilter::makeIndexes( $url ) as $index ) {
                                $arr[] = [
                                        'el_id' => 
$this->getDB()->nextSequenceValue( 'externallinks_el_id_seq' ),
                                        'el_from' => $this->mId,
diff --git a/includes/installer/DatabaseUpdater.php 
b/includes/installer/DatabaseUpdater.php
index 8376223..8247968 100644
--- a/includes/installer/DatabaseUpdater.php
+++ b/includes/installer/DatabaseUpdater.php
@@ -78,6 +78,7 @@
                FixDefaultJsonContentPages::class,
                CleanupEmptyCategories::class,
                AddRFCAndPMIDInterwiki::class,
+               RefreshExternallinksIndex::class,
        ];
 
        /**
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index 10dfd26..faba158 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -1954,7 +1954,19 @@
         * @return string
         */
        public static function normalizeLinkUrl( $url ) {
-               # First, make sure unsafe characters are encoded
+               # Test for RFC 3986 IPv6 syntax
+               $scheme = '[a-z][a-z0-9+.-]*:';
+               $userinfo = '(?:[a-z0-9\-._~!$&\'()*+,;=:]|%[0-9a-f]{2})*';
+               $ipv6Host = '\\[((?:[0-9a-f:]|%3[0-A]|%[46][1-6])+)\\]';
+               if ( preg_match( 
"<^(?:{$scheme})?//(?:{$userinfo}@)?{$ipv6Host}(?:[:/?#].*|)$>i", $url, $m ) &&
+                       IP::isValid( rawurldecode( $m[1] ) )
+               ) {
+                       $isIPv6 = rawurldecode( $m[1] );
+               } else {
+                       $isIPv6 = false;
+               }
+
+               # Make sure unsafe characters are encoded
                $url = preg_replace_callback( 
'/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/',
                        function ( $m ) {
                                return rawurlencode( $m[0] );
@@ -1986,6 +1998,16 @@
                $ret = self::normalizeUrlComponent(
                        substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret;
 
+               # Fix IPv6 syntax
+               if ( $isIPv6 !== false ) {
+                       $ipv6Host = "%5B({$isIPv6})%5D";
+                       $ret = preg_replace(
+                               
"<^((?:{$scheme})?//(?:{$userinfo}@)?){$ipv6Host}(?=[:/?#]|$)>i",
+                               "$1[$2]",
+                               $ret
+                       );
+               }
+
                return $ret;
        }
 
diff --git a/includes/specials/SpecialLinkSearch.php 
b/includes/specials/SpecialLinkSearch.php
index a2fa844..b2f7618 100644
--- a/includes/specials/SpecialLinkSearch.php
+++ b/includes/specials/SpecialLinkSearch.php
@@ -143,37 +143,6 @@
                return false;
        }
 
-       /**
-        * Return an appropriately formatted LIKE query and the clause
-        *
-        * @param string $query Search pattern to search for
-        * @param string $prot Protocol, e.g. 'http://'
-        *
-        * @return array
-        */
-       static function mungeQuery( $query, $prot ) {
-               $field = 'el_index';
-               $dbr = wfGetDB( DB_REPLICA );
-
-               if ( $query === '*' && $prot !== '' ) {
-                       // Allow queries like 'ftp://*' to find all ftp links
-                       $rv = [ $prot, $dbr->anyString() ];
-               } else {
-                       $rv = LinkFilter::makeLikeArray( $query, $prot );
-               }
-
-               if ( $rv === false ) {
-                       // LinkFilter doesn't handle wildcard in IP, so we'll 
have to munge here.
-                       $pattern = 
'/^(:?[0-9]{1,3}\.)+\*\s*$|^(:?[0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]*\*\s*$/';
-                       if ( preg_match( $pattern, $query ) ) {
-                               $rv = [ $prot . rtrim( $query, " \t*" ), 
$dbr->anyString() ];
-                               $field = 'el_to';
-                       }
-               }
-
-               return [ $rv, $field ];
-       }
-
        function linkParameters() {
                $params = [];
                $params['target'] = $this->mProt . $this->mQuery;
@@ -186,16 +155,26 @@
 
        public function getQueryInfo() {
                $dbr = wfGetDB( DB_REPLICA );
-               // strip everything past first wildcard, so that
-               // index-based-only lookup would be done
-               list( $this->mungedQuery, $clause ) = self::mungeQuery( 
$this->mQuery, $this->mProt );
+
+               if ( $this->mQuery === '*' && $this->mProt !== '' ) {
+                       $this->mungedQuery = [
+                               'el_index_60' . $dbr->buildLike( $this->mProt, 
$dbr->anyString() ),
+                       ];
+               } else {
+                       $this->mungedQuery = LinkFilter::getQueryConditions( 
$this->mQuery,
+                               [ 'protocol' => $this->mProt, 'oneWildcard' => 
true, 'db' => $dbr ] );
+               }
                if ( $this->mungedQuery === false ) {
                        // Invalid query; return no results
                        return [ 'tables' => 'page', 'fields' => 'page_id', 
'conds' => '0=1' ];
                }
 
-               $stripped = LinkFilter::keepOneWildcard( $this->mungedQuery );
-               $like = $dbr->buildLike( $stripped );
+               $orderBy = [];
+               if ( !isset( $this->mungedQuery['el_index_60'] ) ) {
+                       $orderBy[] = 'el_index_60';
+               }
+               $orderBy[] = 'el_id';
+
                $retval = [
                        'tables' => [ 'page', 'externallinks' ],
                        'fields' => [
@@ -204,11 +183,13 @@
                                'value' => 'el_index',
                                'url' => 'el_to'
                        ],
-                       'conds' => [
-                               'page_id = el_from',
-                               "$clause $like"
-                       ],
-                       'options' => [ 'USE INDEX' => $clause ]
+                       'conds' => array_merge(
+                               [
+                                       'page_id = el_from',
+                               ],
+                               $this->mungedQuery
+                       ),
+                       'options' => [ 'ORDER BY' => $orderBy ]
                ];
 
                if ( $this->mNs !== null && !$this->getConfig()->get( 
'MiserMode' ) ) {
@@ -245,9 +226,7 @@
 
        /**
         * Override to squash the ORDER BY.
-        * We do a truncated index search, so the optimizer won't trust
-        * it as good enough for optimizing sort. The implicit ordering
-        * from the scan will usually do well enough for our needs.
+        * Not much point in descending order here.
         * @return array
         */
        function getOrderFields() {
diff --git a/maintenance/cleanupSpam.php b/maintenance/cleanupSpam.php
index 4e47cfb..758b1ea 100644
--- a/maintenance/cleanupSpam.php
+++ b/maintenance/cleanupSpam.php
@@ -54,8 +54,8 @@
                        $wgUser->addToDatabase();
                }
                $spec = $this->getArg();
-               $like = LinkFilter::makeLikeArray( $spec );
-               if ( !$like ) {
+               $conds = LinkFilter::getQueryConditions( $spec );
+               if ( !$conds ) {
                        $this->error( "Not a valid hostname specification: 
$spec", true );
                }
 
@@ -66,8 +66,7 @@
                        foreach ( $wgLocalDatabases as $wikiID ) {
                                $dbr = $this->getDB( DB_REPLICA, [], $wikiID );
 
-                               $count = $dbr->selectField( 'externallinks', 
'COUNT(*)',
-                                       [ 'el_index' . $dbr->buildLike( $like ) 
], __METHOD__ );
+                               $count = $dbr->selectField( 'externallinks', 
'COUNT(*)', $conds, __METHOD__ );
                                if ( $count ) {
                                        $found = true;
                                        $cmd = wfShellWikiCmd( 
"$IP/maintenance/cleanupSpam.php",
@@ -84,8 +83,7 @@
                        // Clean up spam on this wiki
 
                        $dbr = $this->getDB( DB_REPLICA );
-                       $res = $dbr->select( 'externallinks', [ 'DISTINCT 
el_from' ],
-                               [ 'el_index' . $dbr->buildLike( $like ) ], 
__METHOD__ );
+                       $res = $dbr->select( 'externallinks', [ 'DISTINCT 
el_from' ], $conds, __METHOD__ );
                        $count = $dbr->numRows( $res );
                        $this->output( "Found $count articles containing 
$spec\n" );
                        foreach ( $res as $row ) {
diff --git a/maintenance/deleteSelfExternals.php 
b/maintenance/deleteSelfExternals.php
index ed15fd1..6d87d37 100644
--- a/maintenance/deleteSelfExternals.php
+++ b/maintenance/deleteSelfExternals.php
@@ -38,18 +38,45 @@
 
        public function execute() {
                global $wgServer;
+
+               // Extract the host and scheme from $wgServer
+               $bits = wfParseUrl( $wgServer );
+               if ( !$bits ) {
+                       $this->error( "Could not parse $wgServer" );
+                       exit( 1 );
+               }
+
                $this->output( "Deleting self externals from $wgServer\n" );
                $db = $this->getDB( DB_MASTER );
-               while ( 1 ) {
-                       wfWaitForSlaves();
-                       $this->commitTransaction( $db, __METHOD__ );
-                       $q = $db->limitResult( "DELETE /* deleteSelfExternals 
*/ FROM externallinks WHERE el_to"
-                               . $db->buildLike( $wgServer . '/', 
$db->anyString() ), $this->mBatchSize );
-                       $this->output( "Deleting a batch\n" );
-                       $db->query( $q );
-                       if ( !$db->affectedRows() ) {
-                               return;
+
+               // If it's protocol-relative, we need to do both http and https.
+               // Otherwise, just do the specified scheme.
+               $host = $bits['host'];
+               if ( isset( $bits['port'] ) ) {
+                       $host .= ':' . $bits['port'];
+               }
+               if ( $bits['scheme'] != '' ) {
+                       $conds = [ LinkFilter::getQueryConditions( $host, [ 
'protocol' => $bits['scheme'] . '://' ] ) ];
+               } else {
+                       $conds = [
+                               LinkFilter::getQueryConditions( $host, [ 
'protocol' => 'http://' ] ),
+                               LinkFilter::getQueryConditions( $host, [ 
'protocol' => 'https://' ] ),
+                       ];
+               }
+
+               foreach ( $conds as $cond ) {
+                       if ( !$cond ) {
+                               continue;
                        }
+                       $cond = $db->makeList( $cond, LIST_AND );
+                       do {
+                               wfWaitForSlaves();
+                               $this->commitTransaction( $db, __METHOD__ );
+                               $q = $db->limitResult( "DELETE /* 
deleteSelfExternals */ FROM externallinks WHERE $cond",
+                                       $this->mBatchSize );
+                               $this->output( "Deleting a batch\n" );
+                               $db->query( $q );
+                       } while ( $db->affectedRows() );
                }
        }
 }
diff --git a/maintenance/refreshExternallinksIndex.php 
b/maintenance/refreshExternallinksIndex.php
new file mode 100644
index 0000000..3da009a
--- /dev/null
+++ b/maintenance/refreshExternallinksIndex.php
@@ -0,0 +1,109 @@
+<?php
+/**
+ * Refresh the externallinks table el_index and el_index_60 from el_to
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Maintenance
+ */
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script that refreshes the externallinks table el_index and
+ * el_index_60 from el_to
+ *
+ * @ingroup Maintenance
+ * @since 1.29
+ */
+class RefreshExternallinksIndex extends LoggedUpdateMaintenance {
+       public function __construct() {
+               parent::__construct();
+               $this->addDescription(
+                       'Refresh the externallinks table el_index and 
el_index_60 from el_to' );
+               $this->setBatchSize( 200 );
+       }
+
+       protected function getUpdateKey() {
+               return 'refresh externallinks v' . LinkFilter::VERSION;
+       }
+
+       protected function updateSkippedMessage() {
+               return 'externallinks table indexes up to date';
+       }
+
+       protected function doDBUpdates() {
+               $db = $this->getDB( DB_MASTER );
+               if ( !$db->tableExists( 'externallinks' ) ) {
+                       $this->error( "externallinks table does not exist" );
+                       return false;
+               }
+               $this->output( "Updating externallinks table index fields\n" );
+
+               $count = 0;
+               $start = 0;
+               $last = $db->selectField( 'externallinks', 'MAX(el_id)', false, 
__METHOD__ );
+               while ( $start <= $last ) {
+                       $end = $start + $this->mBatchSize;
+                       $this->output( "el_id $start - $end of $last\n" );
+                       $res = $db->select( 'externallinks', [ 'el_id', 
'el_to', 'el_index' ],
+                               [
+                                       "el_id > $start",
+                                       "el_id <= $end",
+                               ],
+                               __METHOD__,
+                               [ 'ORDER BY' => 'el_id' ]
+                       );
+                       foreach ( $res as $row ) {
+                               $newIndexes = LinkFilter::makeIndexes( 
$row->el_to );
+                               if ( !$newIndexes ) {
+                                       $this->error( "No new indexes for 
\"{$row->el_to}\"\n" );
+                                       continue;
+                               }
+                               if ( in_array( $row->el_index, $newIndexes, 
true ) ) {
+                                       continue;
+                               }
+
+                               $count++;
+                               if ( count( $newIndexes ) === 1 ) {
+                                       $newIndex = $newIndexes[0];
+                               } else {
+                                       // Assume the scheme is the only 
difference and shouldn't change
+                                       $newIndex = substr( $row->el_index, 0, 
strpos( $row->el_index, ':' ) ) .
+                                               substr( $newIndexes[0], strpos( 
$newIndexes[0], ':' ) );
+                               }
+                               $db->update( 'externallinks',
+                                       [
+                                               'el_index' => $newIndex,
+                                               'el_index_60' => substr( 
$newIndex, 0, 60 ),
+                                       ],
+                                       [
+                                               'el_id' => $row->el_id,
+                                       ], __METHOD__, [ 'IGNORE' ]
+                               );
+                       }
+                       wfWaitForSlaves();
+                       $start = $end;
+               }
+               $this->output( "Done, $count rows updated.\n" );
+
+               return true;
+       }
+}
+
+$maintClass = "RefreshExternallinksIndex";
+require_once RUN_MAINTENANCE_IF_MAIN;
diff --git a/tests/phpunit/includes/GlobalFunctions/GlobalTest.php 
b/tests/phpunit/includes/GlobalFunctions/GlobalTest.php
index 1d48d08..8b66474 100644
--- a/tests/phpunit/includes/GlobalFunctions/GlobalTest.php
+++ b/tests/phpunit/includes/GlobalFunctions/GlobalTest.php
@@ -541,63 +541,6 @@
        }
 
        /**
-        * @dataProvider provideMakeUrlIndexes()
-        * @covers ::wfMakeUrlIndexes
-        */
-       public function testMakeUrlIndexes( $url, $expected ) {
-               $index = wfMakeUrlIndexes( $url );
-               $this->assertEquals( $expected, $index, 
"wfMakeUrlIndexes(\"$url\")" );
-       }
-
-       public static function provideMakeUrlIndexes() {
-               return [
-                       // Testcase for T30627
-                       [
-                               'https://example.org/test.cgi?id=12345',
-                               [ 'https://org.example./test.cgi?id=12345' ]
-                       ],
-                       [
-                               // mailtos are handled special
-                               // is this really right though? that final . 
probably belongs earlier?
-                               'mailto:wiki@wikimedia.org',
-                               [ 'mailto:org.wikimedia@wiki.' ]
-                       ],
-
-                       // file URL cases per T30627...
-                       [
-                               // three slashes: local filesystem path 
Unix-style
-                               'file:///whatever/you/like.txt',
-                               [ 'file://./whatever/you/like.txt' ]
-                       ],
-                       [
-                               // three slashes: local filesystem path 
Windows-style
-                               'file:///c:/whatever/you/like.txt',
-                               [ 'file://./c:/whatever/you/like.txt' ]
-                       ],
-                       [
-                               // two slashes: UNC filesystem path 
Windows-style
-                               'file://intranet/whatever/you/like.txt',
-                               [ 'file://intranet./whatever/you/like.txt' ]
-                       ],
-                       // Multiple-slash cases that can sorta work on Mozilla
-                       // if you hack it just right are kinda pathological,
-                       // and unreliable cross-platform or on IE which means 
they're
-                       // unlikely to appear on intranets.
-                       // Those will survive the algorithm but with results 
that
-                       // are less consistent.
-
-                       // protocol-relative URL cases per T31854...
-                       [
-                               '//example.org/test.cgi?id=12345',
-                               [
-                                       'http://org.example./test.cgi?id=12345',
-                                       'https://org.example./test.cgi?id=12345'
-                               ]
-                       ],
-               ];
-       }
-
-       /**
         * @dataProvider provideWfMatchesDomainList
         * @covers ::wfMatchesDomainList
         */
diff --git a/tests/phpunit/includes/LinkFilterTest.php 
b/tests/phpunit/includes/LinkFilterTest.php
index 428b012..a78627d 100644
--- a/tests/phpunit/includes/LinkFilterTest.php
+++ b/tests/phpunit/includes/LinkFilterTest.php
@@ -124,6 +124,20 @@
                                
'http://xx23124:__ffdfde...@www.test.com:12345/dir' ,
                                
'http://name:p...@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg'
                        ],
+                       [ 'http://', '127.0.0.1', 'http://127.000.000.001' ],
+                       [ 'http://', '127.0.0.*', 'http://127.000.000.010' ],
+                       [ 'http://', '127.0.*', 'http://127.000.123.010' ],
+                       [ 'http://', '127.*', 'http://127.127.127.127' ],
+                       [ 'http://', '[0:0:0:0:0:0:0:0001]', 'http://[::1]' ],
+                       [ 'http://', '[2001:db8:0:0:*]', 'http://[2001:0DB8::]' 
],
+                       [ 'http://', '[2001:db8:0:0:*]', 
'http://[2001:0DB8::123]' ],
+                       [ 'http://', '[2001:db8:0:0:*]', 
'http://[2001:0DB8::123:456]' ],
+                       [ 'http://', 'xn--f-vgaa.example.com', 
'http://fóó.example.com' ],
+                       [ 'http://', 'xn--f-vgaa.example.com', 
'http://f%c3%b3%C3%B3.example.com' ],
+                       [ 'http://', 'fóó.example.com', 
'http://xn--f-vgaa.example.com' ],
+                       [ 'http://', 'f%c3%b3%C3%B3.example.com', 
'http://xn--f-vgaa.example.com' ],
+                       [ 'http://', 'f%c3%b3%C3%B3.example.com', 
'http://fóó.example.com' ],
+                       [ 'http://', 'fóó.example.com', 
'http://f%c3%b3%C3%B3.example.com' ],
 
                        // Tests for false positives
                        [ 'http://', 'test.com', 'http://www.test.com', false ],
@@ -151,6 +165,8 @@
                        [ 'ftp://', 'test.com/dir/', 'ftp://test.com/', false ],
                        [ '', 'http://test.com:8080/dir/', 
'http://test.com:808/dir/', false ],
                        [ '', 'http://test.com/dir/index.html', 
'http://test.com/dir/index.php', false ],
+                       [ 'http://', '127.0.0.*', 'http://127.0.1.0', false ],
+                       [ 'http://', '[2001:db8::*]', 
'http://[2001:0DB8::123:456]', false ],
 
                        // These are false positives too and ideally shouldn't 
match, but that
                        // would require using regexes and RLIKE instead of LIKE
@@ -164,17 +180,17 @@
         * testMakeLikeArrayWithValidPatterns()
         *
         * Tests whether the LIKE clause produced by 
LinkFilter::makeLikeArray($pattern, $protocol)
-        * will find one of the URL indexes produced by wfMakeUrlIndexes($url)
+        * will find one of the URL indexes produced by 
LinkFilter::makeIndexes($url)
         *
         * @dataProvider provideValidPatterns
         *
         * @param string $protocol Protocol, e.g. 'http://' or 'mailto:'
         * @param string $pattern Search pattern to feed to 
LinkFilter::makeLikeArray
-        * @param string $url URL to feed to wfMakeUrlIndexes
+        * @param string $url URL to feed to LinkFilter::makeIndexes
         * @param bool $shouldBeFound Should the URL be found? (defaults true)
         */
        function testMakeLikeArrayWithValidPatterns( $protocol, $pattern, $url, 
$shouldBeFound = true ) {
-               $indexes = wfMakeUrlIndexes( $url );
+               $indexes = LinkFilter::makeIndexes( $url );
                $likeArray = LinkFilter::makeLikeArray( $pattern, $protocol );
 
                $this->assertTrue( $likeArray !== false,
@@ -183,7 +199,7 @@
 
                $regex = $this->createRegexFromLIKE( $likeArray );
                $debugmsg = "Regex: '" . $regex . "'\n";
-               $debugmsg .= count( $indexes ) . " index(es) created by 
wfMakeUrlIndexes():\n";
+               $debugmsg .= count( $indexes ) . " index(es) created by 
LinkFilter::makeIndexes():\n";
 
                $matches = 0;
 
@@ -248,4 +264,148 @@
                );
        }
 
+       /**
+        * @dataProvider provideMakeIndexes()
+        * @covers LinkFilter::makeIndexes
+        */
+       public function testMakeIndexes( $url, $expected ) {
+               // Set global so file:// tests can work
+               $this->setMwGlobals( [
+                       'wgUrlProtocols' => [
+                               'http://',
+                               'https://',
+                               'mailto:',
+                               '//',
+                               'file://', # Non-default
+                       ],
+               ] );
+
+               $index = LinkFilter::makeIndexes( $url );
+               $this->assertEquals( $expected, $index, 
"LinkFilter::makeIndexes(\"$url\")" );
+       }
+
+       public static function provideMakeIndexes() {
+               return [
+                       // Testcase for T30627
+                       [
+                               'https://example.org/test.cgi?id=12345',
+                               [ 'https://org.example./test.cgi?id=12345' ]
+                       ],
+                       [
+                               // mailtos are handled special
+                               'mailto:wiki@wikimedia.org',
+                               [ 'mailto:org.wikimedia.@wiki' ]
+                       ],
+
+                       // file URL cases per T30627...
+                       [
+                               // three slashes: local filesystem path 
Unix-style
+                               'file:///whatever/you/like.txt',
+                               [ 'file://./whatever/you/like.txt' ]
+                       ],
+                       [
+                               // three slashes: local filesystem path 
Windows-style
+                               'file:///c:/whatever/you/like.txt',
+                               [ 'file://./c:/whatever/you/like.txt' ]
+                       ],
+                       [
+                               // two slashes: UNC filesystem path 
Windows-style
+                               'file://intranet/whatever/you/like.txt',
+                               [ 'file://intranet./whatever/you/like.txt' ]
+                       ],
+                       // Multiple-slash cases that can sorta work on Mozilla
+                       // if you hack it just right are kinda pathological,
+                       // and unreliable cross-platform or on IE which means 
they're
+                       // unlikely to appear on intranets.
+                       // Those will survive the algorithm but with results 
that
+                       // are less consistent.
+
+                       // protocol-relative URL cases per T31854...
+                       [
+                               '//example.org/test.cgi?id=12345',
+                               [
+                                       'http://org.example./test.cgi?id=12345',
+                                       'https://org.example./test.cgi?id=12345'
+                               ]
+                       ],
+               ];
+       }
+
+       /**
+        * @dataProvider provideGetQueryConditions
+        * @covers LinkFilter::getQueryConditions
+        */
+       public function testGetQueryConditions( $query, $options, $expected ) {
+               $conds = LinkFilter::getQueryConditions( $query, $options );
+               $this->assertEquals( $expected, $conds );
+       }
+
+       public static function provideGetQueryConditions() {
+               return [
+                       'Basic example' => [
+                               'example.com',
+                               [],
+                               [
+                                       'el_index_60 >= 
\'http://com.example./\'',
+                                       'el_index_60 < 
\'http://com.example.0\'',
+                                       'el_index LIKE 
\'http://com.example./%\' ',
+                               ],
+                       ],
+                       'Basic example with path' => [
+                               'example.com/foobar',
+                               [],
+                               [
+                                       'el_index_60 >= 
\'http://com.example./foobar\'',
+                                       'el_index_60 < 
\'http://com.example./foobas\'',
+                                       'el_index LIKE 
\'http://com.example./foobar%\' ',
+                               ],
+                       ],
+                       'Wildcard domain' => [
+                               '*.example.com',
+                               [],
+                               [
+                                       'el_index_60 >= 
\'http://com.example.\'',
+                                       'el_index_60 < \'http://com.example/\'',
+                                       'el_index LIKE \'http://com.example.%\' 
',
+                               ],
+                       ],
+                       'Wildcard domain with path' => [
+                               '*.example.com/foobar',
+                               [],
+                               [
+                                       'el_index_60 >= 
\'http://com.example.\'',
+                                       'el_index_60 < \'http://com.example/\'',
+                                       'el_index LIKE 
\'http://com.example.%/foobar%\' ',
+                               ],
+                       ],
+                       'Wildcard domain with path, oneWildcard=true' => [
+                               '*.example.com/foobar',
+                               [ 'oneWildcard' => true ],
+                               [
+                                       'el_index_60 >= 
\'http://com.example.\'',
+                                       'el_index_60 < \'http://com.example/\'',
+                                       'el_index LIKE \'http://com.example.%\' 
',
+                               ],
+                       ],
+                       'Constant prefix' => [
+                               
'example.com/blah/blah/blah/blah/blah/blah/blah/blah/blah/blah?foo=',
+                               [],
+                               [
+                                       'el_index_60' => 
'http://com.example./blah/blah/blah/blah/blah/blah/blah/blah/',
+                                       'el_index LIKE ' .
+                                               
'\'http://com.example./blah/blah/blah/blah/blah/blah/blah/blah/blah/blah?foo=%\'
 ',
+                               ],
+                       ],
+                       'Various options' => [
+                               'example.com',
+                               [ 'protocol' => 'https://', 'prefix' => 'xx' ],
+                               [
+                                       'xx_index_60 >= 
\'https://com.example./\'',
+                                       'xx_index_60 < 
\'https://com.example.0\'',
+                                       'xx_index LIKE 
\'https://com.example./%\' ',
+                               ],
+                       ],
+               ];
+       }
+
 }
diff --git a/tests/phpunit/includes/parser/ParserMethodsTest.php 
b/tests/phpunit/includes/parser/ParserMethodsTest.php
index 5e00384..e3c2c80 100644
--- a/tests/phpunit/includes/parser/ParserMethodsTest.php
+++ b/tests/phpunit/includes/parser/ParserMethodsTest.php
@@ -184,6 +184,16 @@
                                
'http://example.org/%23%2F%3F%26%3D%2B%3B?%23%2F%3F%26%3D%2B%3B#%23%2F%3F%26%3D%2B%3B',
                                
'http://example.org/%23%2F%3F&=+;?%23/?%26%3D%2B%3B#%23/?&=+;',
                        ],
+                       [
+                               'IPv6 links aren\'t escaped',
+                               'http://[::1]/foobar',
+                               'http://[::1]/foobar',
+                       ],
+                       [
+                               'non-IPv6 links aren\'t unescaped',
+                               'http://%5B::1%5D/foobar',
+                               'http://%5B::1%5D/foobar',
+                       ],
                ];
        }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/322729
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I84d224ef23de22dfe179009ec3a11fd0e4b5f56d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Anomie <bjor...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to