Smalyshev has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/326039 )
Change subject: [WIP] Move geosearch features to GeoData extension. ...................................................................... [WIP] Move geosearch features to GeoData extension. Depends on Id08efd46337a977639ebf3724ee3492512f326ac to actually work, but since it works via hook, can be merged independently. Bug: T152730 Change-Id: I24eb69b1aa91c528a230432164f6441593d95365 --- M extension.json M includes/Hooks.php A includes/Search/CirrusGeoFeature.php R includes/Search/CoordinatesIndexField.php R includes/Search/GeoPointIndexField.php A includes/Search/GeoRadiusFunctionScoreBuilder.php A tests/phpunit/GeoFeatureTest.php 7 files changed, 573 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/GeoData refs/changes/39/326039/1 diff --git a/extension.json b/extension.json index f9f82a4..15299df 100644 --- a/extension.json +++ b/extension.json @@ -24,12 +24,14 @@ "GeoData\\ApiQueryGeoSearchDb": "includes/api/ApiQueryGeoSearchDb.php", "GeoData\\ApiQueryGeoSearchElastic": "includes/api/ApiQueryGeoSearchElastic.php", "GeoData\\BoundingBox": "includes/BoundingBox.php", + "GeoData\\CirrusGeoFeature": "includes/Search/CirrusGeoFeature.php", "GeoData\\Coord": "includes/Coord.php", - "GeoData\\CoordinatesIndexField" : "includes/CoordinatesIndexField.php", + "GeoData\\CoordinatesIndexField" : "includes/Search/CoordinatesIndexField.php", "GeoData\\CoordinatesOutput": "includes/CoordinatesOutput.php", "GeoData\\CoordinatesParserFunction": "includes/CoordinatesParserFunction.php", "GeoData\\GeoData": "includes/GeoData.body.php", - "GeoData\\GeoPointIndexField" : "includes/GeoPointIndexField.php", + "GeoData\\GeoPointIndexField" : "includes/Search/GeoPointIndexField.php", + "GeoData\\GeoRadiusFunctionScoreBuilder": "includes/Search/GeoRadiusFunctionScoreBuilder.php", "GeoData\\Globe": "includes/Globe.php", "GeoData\\Hooks": "includes/Hooks.php", "GeoData\\Math": "includes/Math.php", @@ -45,6 +47,7 @@ "SearchIndexFields": "GeoData\\Hooks::onSearchIndexFields", "SearchDataForIndex": "GeoData\\Hooks::onSearchDataForIndex", "ParserTestTables": "GeoData\\Hooks::onParserTestTables", + "CirrusSearchAddQueryFeatures": "GeoData\\Hooks::onCirrusSearchAddQueryFeatures", "ApiQuery::moduleManager": "GeoData\\Hooks::onApiQueryModuleManager" }, "TrackingCategories": [ @@ -115,6 +118,12 @@ "GeoDataUseCirrusSearch": { "value": false }, + "GeoDataRadiusScoreOverrides": { + "value": { + "config_override": "CirrusSearchPreferGeoRadiusWeight", + "uri_param_override": "cirrusPreferGeoRadiusWeight" + } + }, "GeoDataDebug": { "value": false } diff --git a/includes/Hooks.php b/includes/Hooks.php index 133a73c..ae86c60 100644 --- a/includes/Hooks.php +++ b/includes/Hooks.php @@ -4,6 +4,7 @@ use ApiModuleManager; use Article; +use CirrusSearch\SearchConfig; use Content; use DatabaseUpdater; use LinksUpdate; @@ -349,4 +350,13 @@ ); } } + + /** + * Add geo-search feature to search syntax + * @param $config + * @param array $features + */ + public static function onCirrusSearchAddQueryFeatures( $config, array &$features ) { + $features[] = new CirrusGeoFeature(); + } } diff --git a/includes/Search/CirrusGeoFeature.php b/includes/Search/CirrusGeoFeature.php new file mode 100644 index 0000000..4631c52 --- /dev/null +++ b/includes/Search/CirrusGeoFeature.php @@ -0,0 +1,225 @@ +<?php + +namespace GeoData; + +use CirrusSearch\Query\SimpleKeywordFeature; +use CirrusSearch\Search\SearchContext; +use CirrusSearch\SearchConfig; +use Elastica\Query\AbstractQuery; +use Title; + +/** + * Applies geo based features to the query. + * + * Two forms of geo based querying are provided: a filter that limits search + * results to a geographic area and a boost that increases the score of + * results within the geographic area. Supports specifying geo coordinates + * either by providing a latitude and longitude, or a page title to source the + * latitude and longitude from. All values can be prefixed with a radius in m + * or km to apply. If not specified this defaults to 5km. + * + * Examples: + * neartitle:Shanghai + * neartitle:50km,Seoul + * nearcoord:1.2345,-5.4321 + * nearcoord:17km,54.321,-12.345 + * boost-neartitle:"San Francisco" + * boost-neartitle:50km,Kampala + * boost-nearcoord:-12.345,87.654 + * boost-nearcoord:77km,34.567,76.543 + */ +class CirrusGeoFeature extends SimpleKeywordFeature { + // Default radius, in meters + const DEFAULT_RADIUS = 5000; + // Default globe + const DEFAULT_GLOBE = 'earth'; + + /** + * @return string[] + */ + protected function getKeywords() { + return ['boost-nearcoord', 'boost-neartitle', 'nearcoord', 'neartitle']; + } + + /** + * @param SearchContext $context + * @param string $key The keyword + * @param string $value The value attached to the keyword with quotes stripped + * @param string $quotedValue The original value in the search string, including quotes if used + * @param bool $negated Is the search negated? Not used to generate the returned AbstractQuery, + * that will be negated as necessary. Used for any other building/context necessary. + * @return array Two element array, first an AbstractQuery or null to apply to the + * query. Second a boolean indicating if the quotedValue should be kept in the search + * string. + */ + protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) { + if ( substr( $key, -5 ) === 'title' ) { + list( $coord, $radius, $excludeDocId ) = $this->parseGeoNearbyTitle( + $context->getConfig(), + $value + ); + } else { + list( $coord, $radius ) = $this->parseGeoNearby( $value ); + $excludeDocId = ''; + } + + $filter = null; + if ( $coord ) { + if ( substr( $key, 0, 6 ) === 'boost-' ) { + $this->getRescoreBuilder( $context, $negated ? 0.1 : 1, $coord, $radius ); + } else { + $filter = self::createQuery( $coord, $radius, $excludeDocId ); + } + } + + return [ $filter, false ]; + } + + /** + * Create rescore builder for geo search functions + * @param SearchContext $context + * @param Coord $coord + * @param int $radius + * @param float $weight + */ + protected function getRescoreBuilder( SearchContext $context, $weight, Coord $coord, $radius ) { + $builder = new GeoRadiusFunctionScoreBuilder( $context, $weight, $coord, $radius ); + $context->addCustomRescoreComponent( $builder ); + } + + /** + * radius, if provided, must have either m or km suffix. Valid formats: + * <title> + * <radius>,<title> + * + * @param SearchConfig $config the Cirrus config object + * @param string $text user input to parse + * @return array Three member array with Coordinate object, integer radius + * in meters, and page id to exclude from results.. When invalid the + * Coordinate returned will be null. + */ + public function parseGeoNearbyTitle( SearchConfig $config, $text ) { + $title = Title::newFromText( $text ); + if ( $title && $title->exists() ) { + // Default radius if not provided: 5km + $radius = self::DEFAULT_RADIUS; + } else { + // If the provided value is not a title try to extract a radius prefix + // from the beginning. If $text has a valid radius prefix see if the + // remaining text is a valid title to use. + $pieces = explode( ',', $text, 2 ); + if ( count( $pieces ) !== 2 ) { + return [ null, 0, '' ]; + } + $radius = $this->parseDistance( $pieces[0] ); + if ( $radius === null ) { + return [ null, 0, '' ]; + } + $title = Title::newFromText( $pieces[1] ); + if ( !$title || !$title->exists() ) { + return [ null, 0, '' ]; + } + } + + $coord = GeoData::getPageCoordinates( $title ); + if ( !$coord ) { + return [ null, 0, '' ]; + } + + return [ $coord, $radius, $config->makeId( $title->getArticleID() ) ]; + } + + /** + * radius, if provided, must have either m or km suffix. Latitude and longitude + * must be floats in the domain of [-90:90] for latitude and [-180,180] for + * longitude. Valid formats: + * <lat>,<lon> + * <radius>,<lat>,<lon> + * + * @param string $text + * @return array Two member array with Coordinate object, and integer radius + * in meters. When invalid the Coordinate returned will be null. + */ + public function parseGeoNearby( $text ) { + $pieces = explode( ',', $text, 3 ); + // Default radius if not provided: 5km + $radius = self::DEFAULT_RADIUS; + if ( count( $pieces ) === 3 ) { + $radius = $this->parseDistance( $pieces[0] ); + if ( $radius === null ) { + return [ null, 0 ]; + } + $lat = $pieces[1]; + $lon = $pieces[2]; + } elseif ( count( $pieces ) === 2 ) { + $lat = $pieces[0]; + $lon = $pieces[1]; + } else { + return [ null, 0 ]; + } + + $globe = new Globe( self::DEFAULT_GLOBE ); + if ( !$globe->coordinatesAreValid( $lat, $lon ) ) { + return [ null, 0 ]; + } + + return [ + new Coord( floatval( $lat ), floatval( $lon ), $globe->getName() ), + $radius, + ]; + } + + /** + * @param string $distance + * @return int|null Parsed distance in meters, or null if unparsable + */ + public function parseDistance( $distance ) { + if ( !preg_match( '/^(\d+)(m|km|mi|ft|yd)$/', $distance, $matches ) ) { + return null; + } + + $scale = [ + 'm' => 1, + 'km' => 1000, + // Supported non-SI units, and their conversions, sourced from + // https://en.wikipedia.org/wiki/Unit_of_length#Imperial.2FUS + 'mi' => 1609.344, + 'ft' => 0.3048, + 'yd' => 0.9144, + ]; + + return max( 10, (int) round( $matches[1] * $scale[$matches[2]] ) ); + } + + /** + * Create a filter for near: and neartitle: queries. + * + * @param Coord $coord + * @param int $radius Search radius in meters + * @param string $docIdToExclude Document id to exclude, or "" for no exclusions. + * @return AbstractQuery + */ + public static function createQuery( Coord $coord, $radius, $docIdToExclude = '' ) { + $query = new \Elastica\Query\BoolQuery(); + $query->addFilter( new \Elastica\Query\Term( [ 'coordinates.globe' => $coord->globe ] ) ); + $query->addFilter( new \Elastica\Query\Term( [ 'coordinates.primary' => 1 ] ) ); + + $distanceFilter = new \Elastica\Query\GeoDistance( + 'coordinates.coord', + [ 'lat' => $coord->lat, 'lon' => $coord->lon ], + $radius . 'm' + ); + $distanceFilter->setOptimizeBbox( 'indexed' ); + $query->addFilter( $distanceFilter ); + + if ( $docIdToExclude !== '' ) { + $query->addMustNot( new \Elastica\Query\Term( [ '_id' => $docIdToExclude ] ) ); + } + + $nested = new \Elastica\Query\Nested(); + $nested->setPath( 'coordinates' )->setQuery( $query ); + + return $nested; + } + +} diff --git a/includes/CoordinatesIndexField.php b/includes/Search/CoordinatesIndexField.php similarity index 100% rename from includes/CoordinatesIndexField.php rename to includes/Search/CoordinatesIndexField.php diff --git a/includes/GeoPointIndexField.php b/includes/Search/GeoPointIndexField.php similarity index 100% rename from includes/GeoPointIndexField.php rename to includes/Search/GeoPointIndexField.php diff --git a/includes/Search/GeoRadiusFunctionScoreBuilder.php b/includes/Search/GeoRadiusFunctionScoreBuilder.php new file mode 100644 index 0000000..a1e14be --- /dev/null +++ b/includes/Search/GeoRadiusFunctionScoreBuilder.php @@ -0,0 +1,42 @@ +<?php +namespace GeoData; + +use CirrusSearch\Search\FunctionScoreBuilder; +use CirrusSearch\Search\SearchContext; +use Elastica\Query\FunctionScore; + +/** + * Builds a boost for documents based on geocoordinates. + * Initialized by special syntax in user query. + * @see CirrusGeoFeature + * @package GeoData + */ +class GeoRadiusFunctionScoreBuilder extends FunctionScoreBuilder { + /** + * Default feature weight + */ + const DEFAULT_WEIGHT = 2; + /** + * @var Coord + */ + private $coord; + /** + * @var int + */ + private $radius; + + public function __construct( SearchContext $context, $weight, Coord $coord, $radius ) { + $weight = $this->context->getConfig()->get( 'GeoDataRadiusScoreOverrides' ); + $weight['weight'] = self::DEFAULT_WEIGHT; + // Overrides will be applied to weight in parent ctor + parent::__construct( $context, $weight ); + $this->weight *= $weight; + $this->coord = $coord; + $this->radius = $radius; + } + + public function append( FunctionScore $functionScore ) { + $functionScore->addWeightFunction( $this->weight, + CirrusGeoFeature::createQuery( $this->coord, $this->radius ) ); + } +} diff --git a/tests/phpunit/GeoFeatureTest.php b/tests/phpunit/GeoFeatureTest.php new file mode 100644 index 0000000..2caf196 --- /dev/null +++ b/tests/phpunit/GeoFeatureTest.php @@ -0,0 +1,285 @@ +<?php + +namespace GeoData; + +use CirrusSearch\SearchConfig; +use LoadBalancer; +use IDatabase; +use MediaWiki\MediaWikiServices; +use MediaWikiTestCase; +use Title; + +/** + * Test GeoFeature functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @group GeoData + */ +class GeoFeatureTest extends MediaWikiTestCase { + + protected function setUp() { + parent::setUp(); + if ( !class_exists( 'CirrusSearch' ) ) { + $this->markTestSkipped( 'CirrusSearch not installed, skipping' ); + } + } + + public function parseDistanceProvider() { + return [ + 'unknown units returns null' => [ + null, + '100fur', + ], + 'gibberish returns null' => [ + null, + 'gibberish', + ], + 'no space allowed between numbers and units' => [ + null, + '100 m', + ], + 'meters' => [ + 100, + '100m', + ], + 'kilometers' => [ + 1000, + '1km', + ], + 'yards' => [ + 366, + '400yd', + ], + 'one mile rounds down' => [ + 1609, + '1mi', + ], + 'two miles rounds up' => [ + '3219', + '2mi', + ], + '1000 feet rounds up' => [ + 305, + '1000ft', + ], + '3000 feet rounds down' => [ + 914, + '3000ft', + ], + 'small requests are bounded' => [ + 10, + '1ft', + ], + 'allows large inputs' => [ + 4321000, + '4321km', + ], + ]; + } + + /** + * @dataProvider parseDistanceProvider + */ + public function testParseDistance( $expected, $distance ) { + $feature = new CirrusGeoFeature(); + $this->assertEquals( $expected, $feature->parseDistance( $distance ) ); + } + + public function parseGeoNearbyProvider() { + return [ + 'random input' => [ + [ null, 0 ], + 'gibberish' + ], + 'random input with comma' => [ + [ null, 0 ], + 'gibberish,42.42' + ], + 'random input with valid radius prefix' => [ + [ null, 0 ], + '20km,42.42,invalid', + ], + 'valid coordinate, default radius' => [ + [ + [ 'lat' => 1.2345, 'lon' => 2.3456 ], + 5000, + ], + '1.2345,2.3456', + ], + 'valid coordinate, specific radius in meters' => [ + [ + [ 'lat' => -5.4321, 'lon' => 42.345 ], + 4321, + ], + '4321m,-5.4321,42.345', + ], + 'valid coordinate, specific radius in kilmeters' => [ + [ + [ 'lat' => 0, 'lon' => 42.345 ], + 7000, + ], + '7km,0,42.345', + ], + 'out of bounds positive latitude' => [ + [ null, 0 ], + '90.1,0' + ], + 'out of bounds negative latitude' => [ + [ null, 0 ], + '-90.1,17', + ], + 'out of bounds positive longitude' => [ + [ null, 0 ], + '49,180.1', + ], + 'out of bounds negative longitude' => [ + [ null, 0 ], + '49,-180.001', + ], + 'valid coordinate with spaces' => [ + [ + [ 'lat' => 1.2345, 'lon' => 9.8765 ], + 5000 + ], + '1.2345, 9.8765' + ], + ]; + } + + /** + * @dataProvider parseGeoNearbyProvider + */ + public function testParseGeoNearby( $expected, $value ) { + $feature = new CirrusGeoFeature(); + $result = $feature->parseGeoNearby( $value ); + if ( $result[0] instanceof Coord ) { + $result[0] = [ 'lat' => $result[0]->lat, 'lon' => $result[0]->lon ]; + } + $this->assertEquals( $expected, $result ); + } + + public function parseGeoNearbyTitleProvider() { + return [ + 'basic page lookup' => [ + [ + [ 'lat' => 1.2345, 'lon' => 5.4321 ], + 5000, + 7654321, + ], + 'San Francisco' + ], + 'basic page lookup with radius in meters' => [ + [ + [ 'lat' => 1.2345, 'lon' => 5.4321 ], + 1234, + 7654321, + ], + '1234m,San Francisco' + ], + 'basic page lookup with radius in kilometers' => [ + [ + [ 'lat' => 1.2345, 'lon' => 5.4321 ], + 2000, + 7654321, + ], + '2km,San Francisco' + ], + 'basic page lookup with space between radius and name' => [ + [ + [ 'lat' => 1.2345, 'lon' => 5.4321 ], + 2000, + 7654321, + ], + '2km, San Francisco' + ], + 'page with comma in name' => [ + [ + [ 'lat' => 1.2345, 'lon' => 5.4321 ], + 5000, + 1234567, + ], + 'Washington, D.C.' + ], + 'page with comma in name and radius in kilometers' => [ + [ + [ 'lat' => 1.2345, 'lon' => 5.4321 ], + 7000, + 1234567, + ], + '7km,Washington, D.C.' + ], + 'unknown page lookup' => [ + [ null, 0, '' ], + 'Unknown Title', + ], + 'unknown page lookup with radius' => [ + [ null, 0, '' ], + '4km, Unknown Title', + ], + ]; + } + + /** + * @dataProvider parseGeoNearbyTitleProvider + */ + public function testParseGeoNearbyTitle( $expected, $value ) { + // Replace database with one that will return our fake coordinates if asked + $db = $this->getMock( IDatabase::class ); + $db->expects( $this->any() ) + ->method( 'select' ) + ->with( 'geo_tags', $this->anything(), $this->anything(), $this->anything() ) + ->will( $this->returnValue( [ + (object) [ 'gt_lat' => 1.2345, 'gt_lon' => 5.4321 ], + ] ) ); + // Tell LinkCache all titles not explicitly added don't exist + $db->expects( $this->any() ) + ->method( 'selectRow' ) + ->with( 'page', $this->anything(), $this->anything(), $this->anything() ) + ->will( $this->returnValue( false ) ); + // Inject mock database into a mock LoadBalancer + $lb = $this->getMockBuilder( LoadBalancer::class ) + ->disableOriginalConstructor() + ->getMock(); + $lb->expects( $this->any() ) + ->method( 'getConnection' ) + ->will( $this->returnValue( $db ) ); + $this->setService( 'DBLoadBalancer', $lb ); + + // Inject fake San Francisco page into LinkCache so it "exists" + MediaWikiServices::getInstance()->getLinkCache() + ->addGoodLinkObj( 7654321, Title::newFromText( 'San Francisco' ) ); + // Inject fake page with comma in it as well + MediaWikiServices::getInstance()->getLinkCache() + ->addGoodLinkObj( 1234567, Title::newFromText( 'Washington, D.C.' ) ); + + $config = $this->getMock( SearchConfig::class ); + $config->expects( $this->any() ) + ->method( 'makeId' ) + ->will( $this->returnCallback( function ( $id ) { + return $id; + } ) ); + + // Finally run the test + $feature = new CirrusGeoFeature(); + $result = $feature->parseGeoNearbyTitle( $config, $value ); + if ( $result[0] instanceof Coord ) { + $result[0] = [ 'lat' => $result[0]->lat, 'lon' => $result[0]->lon ]; + } + + $this->assertEquals( $expected, $result ); + } +} -- To view, visit https://gerrit.wikimedia.org/r/326039 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I24eb69b1aa91c528a230432164f6441593d95365 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/GeoData Gerrit-Branch: master Gerrit-Owner: Smalyshev <smalys...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits