Mschwarzer has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/329626 )
Change subject: Add Citolytics query prefix ...................................................................... Add Citolytics query prefix Additional query prefix (citolytics:) allows retrieval of link-based article recommendations via Citolytics. Recommendations are stored in a separate Elasticsearch index (citolytics_content). New class CirrusSearch\Search\CitolyticsResultsType handles transformation to article result sets. The Citolytics project should improve the mobile recommendations. Issue: T142477 Change-Id: I7525eef60c60ce747d194321c552a3df22d96d8f --- M autoload.php A includes/Query/CitolyticsFeature.php M includes/Query/FullTextQueryStringQueryBuilder.php M includes/Search/ResultsType.php M includes/Search/SearchContext.php M includes/Searcher.php A tests/unit/Query/CitolyticsFeatureTest.php 7 files changed, 302 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/26/329626/1 diff --git a/autoload.php b/autoload.php index 33c806c..6d838ad 100644 --- a/autoload.php +++ b/autoload.php @@ -186,4 +186,7 @@ 'CirrusSearch\\UserTesting' => __DIR__ . '/includes/UserTesting.php', 'CirrusSearch\\Util' => __DIR__ . '/includes/Util.php', 'CirrusSearch\\Version' => __DIR__ . '/includes/Version.php', + 'CirrusSearch\\Search\\CitolyticsResultsType' => __DIR__ . '/includes/Search/ResultsType.php', + 'CirrusSearch\\Query\\CitolyticsFeature' => __DIR__ . '/includes/Query/CitolyticsFeature.php', + ]; diff --git a/includes/Query/CitolyticsFeature.php b/includes/Query/CitolyticsFeature.php new file mode 100644 index 0000000..f230daa --- /dev/null +++ b/includes/Query/CitolyticsFeature.php @@ -0,0 +1,37 @@ +<?php + +namespace CirrusSearch\Query; + +use CirrusSearch\Search\CitolyticsResultsType; +use CirrusSearch\Search\SearchContext; + +class CitolyticsFeature implements KeywordFeature { + /** @const string query prefix that triggers Citolytics */ + const CITOLYTICS_PREFIX = 'citolytics:'; + + const CITOLYTICS_FIELD = 'title'; + const CITOLYTICS_INDEX_BASE = 'citolytics'; + + /** + * Greedily match the entire $term as a citolytics query. + * + * @param SearchContext $context + * @param string $term + * @return string + */ + public function apply( SearchContext $context, $term ) { + if ( substr( $term, 0, strlen( self::CITOLYTICS_PREFIX ) ) === self::CITOLYTICS_PREFIX ) { + + $term = substr( $term, strlen( self::CITOLYTICS_PREFIX ) ); + $context->setExtraIndexBaseName( self::CITOLYTICS_INDEX_BASE ); + $context->setExtraResultsType( new CitolyticsResultsType() ); + + $query = new \Elastica\Query\Match( self::CITOLYTICS_FIELD, $term ); + $context->setMainQuery( $query ); + + return ''; + } + + return $term; + } +} \ No newline at end of file diff --git a/includes/Query/FullTextQueryStringQueryBuilder.php b/includes/Query/FullTextQueryStringQueryBuilder.php index 2374fca..789d667 100644 --- a/includes/Query/FullTextQueryStringQueryBuilder.php +++ b/includes/Query/FullTextQueryStringQueryBuilder.php @@ -53,6 +53,11 @@ $term = $feature->apply( $searchContext, $term ); } + // Skip if query was already set by a feature + if ( !$searchContext->getQuery() instanceof \Elastica\Query\MatchAll ) { + return; + } + if ( !$searchContext->areResultsPossible() ) { return; } diff --git a/includes/Search/ResultsType.php b/includes/Search/ResultsType.php index 38780a3..33aff08 100644 --- a/includes/Search/ResultsType.php +++ b/includes/Search/ResultsType.php @@ -640,3 +640,70 @@ return new EmptyResultSet(); } } + +/** + * Result type for a Citolytics search. + */ +class CitolyticsResultsType implements ResultsType { + /** + * @return false|string|array corresponding to Elasticsearch source filtering syntax + */ + public function getSourceFiltering() { + return [ 'id', 'title', 'namespace', 'redirect.*', 'timestamp', 'text_bytes', 'related_content' ]; + } + + /** + * @return string + */ + public function getFields() { + return array(); // all data is stored in source field. + } + + /** + * @param array $highlightSource + * @return array|null + */ + public function getHighlightingConfiguration( array $highlightSource ) { + return null; + } + + /** + * Citolytics recommendations are stored as array in the citolytics_content index. Array elements need to be transformed + * to regular ES result set to be accessible via CirrusSearch as individual search results. + * + * @param SearchContext $context + * @param \Elastica\ResultSet $result + * @return ResultSet + */ + public function transformElasticsearchResult( SearchContext $context, \Elastica\ResultSet $result ) { + $docs = $result->getDocuments(); + $key_value_index = 0; // If multiple-index queries are used + + // Check for empty results + if ( count( $docs ) < 1 || !isset( $docs[$key_value_index] ) || !isset( $docs[$key_value_index]->getData()['related_content'] ) || count( $docs[$key_value_index]->getData()['related_content'] ) < 1 ) { + return new EmptyResultSet(); + } + + $relatedContent = $docs[$key_value_index]->getData()['related_content']; + // Overwrite hits in original response with names + $overwritten = $result->getResponse()->getData(); + $overwritten['hits'] = array( 'total' => count( $relatedContent ), // Needs to be set for pagination + 'max_score' => 1, 'hits' => array() ); + // Generate artificial search results + foreach ( $relatedContent as $data ) { + $overwritten['hits']['hits'][] = array( '_score' => 1, '_source' => array( 'title' => $data['title'], 'namespace' => 0 ), 'fields' => array(), ); + } + // Build new result set from original query and overwritten reponse + $response = new \Elastica\Response( $overwritten, $result->getResponse()->getStatus() ); + $result = \Elastica\ResultSet::create( $response, $result->getQuery() ); + + return new ResultSet( $context->getSuggestPrefixes(), $context->getSuggestSuffixes(), $result, $context->isSyntaxUsed(), $context->getConfig() ); + } + + /** + * @return EmptyResultSet + */ + public function createEmptyResult() { + return new EmptyResultSet(); + } +} \ No newline at end of file diff --git a/includes/Search/SearchContext.php b/includes/Search/SearchContext.php index 380edc4..725819e 100644 --- a/includes/Search/SearchContext.php +++ b/includes/Search/SearchContext.php @@ -720,4 +720,37 @@ $this->extraScoreBuilders[] = $rescore; } + /** + * @var string + */ + private $extraIndexBaseName; + + /** + * @var ResultsType + */ + private $extraResultsType; + + public function hasExtraIndexBaseName() { + return isset($this->extraIndexBaseName); + } + + public function getExtraIndexBaseName() { + return $this->extraIndexBaseName; + } + + public function setExtraIndexBaseName($indexBaseName) { + $this->extraIndexBaseName = $indexBaseName; + } + + public function hasExtraResultsType() { + return isset($this->extraResultsType); + } + + public function getExtraResultsType() { + return $this->extraResultsType; + } + + public function setExtraResultsType($resultsType) { + $this->extraResultsType = $resultsType; + } } diff --git a/includes/Searcher.php b/includes/Searcher.php index 14a5cf7..e9f60c4 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -3,6 +3,7 @@ namespace CirrusSearch; use CirrusSearch\Query\SimpleKeywordFeature; +use CirrusSearch\Search\CitolyticsResultsType; use CirrusSearch\Search\FullTextResultsType; use CirrusSearch\Search\ResultsType; use CirrusSearch\Search\RescoreBuilder; @@ -301,6 +302,8 @@ // very first item until combining with other queries // is worked out. new Query\MoreLikeFeature( $this->config ), + // Handle Citolytics prefix (greedy) + new Query\CitolyticsFeature(), // Handle title prefix notation (greedy) new Query\PrefixFeature(), // Handle prefer-recent keyword @@ -348,8 +351,6 @@ $builderSettings['settings'] ); - - if ( !( $qb instanceof FullTextQueryBuilder ) ) { throw new RuntimeException( "Bad builder class configured: {$builderSettings['builder_class']}" ); } @@ -376,6 +377,14 @@ $qb = $this->buildFullTextSearch( $term, $showSuggestion ); + if ( $this->searchContext->hasExtraIndexBaseName() ) { + $this->indexBaseName = $this->searchContext->getExtraIndexBaseName(); + } + + if ( $this->searchContext->hasExtraResultsType() ) { + $this->setResultsType( $this->searchContext->getExtraResultsType() ); + } + $status = $this->searchOne(); if ( !$status->isOK() && ElasticaErrorHandler::isParseError( $status ) ) { if ( $qb->buildDegraded( $this->searchContext ) ) { diff --git a/tests/unit/Query/CitolyticsFeatureTest.php b/tests/unit/Query/CitolyticsFeatureTest.php new file mode 100644 index 0000000..4389b47 --- /dev/null +++ b/tests/unit/Query/CitolyticsFeatureTest.php @@ -0,0 +1,146 @@ +<?php + +namespace CirrusSearch\Query; + +use CirrusSearch\CirrusTestCase; +use CirrusSearch\Connection; +use CirrusSearch\Search\CitolyticsResultsType; +use CirrusSearch\Search\SearchContext; +use MediaWiki\MediaWikiServices; + +/** + * Test Citolytics keyword feature. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @group CirrusSearch + */ +class CitolyticsFeatureTest extends CirrusTestCase { + /** + * @var \Elastica\Client + */ + private $client; + + /** + * @var \CirrusSearch\SearchConfig + */ + private $config; + + protected $testDataIds = [ ]; + protected $testData = [ array( 'title' => 'Foo', 'category' => 'Some category', 'namespace' => 0, 'related_content' => [ array( 'title' => 'Some related page', 'score' => 0.9 ), array( 'title' => 'Another related page', 'score' => 0.8 ) ] ), array( 'title' => 'Bar', 'category' => 'Some category', 'namespace' => 0, 'related_content' => [ array( 'title' => 'Foo', 'score' => 0.9 ), array( 'title' => 'Foo page', 'score' => 0.8 ), array( 'title' => 'Foo related', 'score' => 0.3 ), array( 'title' => 'Foo Bar', 'score' => 0.1 ), ] ), ]; + + private function getPath() { + return CitolyticsFeature::CITOLYTICS_INDEX_BASE . '_content/page'; + } + + protected function setUp() { + parent::setUp(); + + $this->config = MediaWikiServices::getInstance()->getConfigFactory()->makeConfig( 'CirrusSearch' ); + $this->client = ( new Connection( $this->config ) )->getClient(); + + $this->insertTestData(); + } + + protected function tearDown() { + $this->removeTestData(); + parent::tearDown(); + } + + private function insertTestData() { + foreach ( $this->testData as $data ) { + $response = $this->client->request( $this->getPath(), \Elastica\Request::POST, $data ); + + $this->assertTrue( $response->isOk(), 'Creating test data failed' ); + + $this->testDataIds[] = $response->getData()['_id']; + } + } + + private function removeTestData() { + foreach ( $this->testDataIds as $id ) { + $response = $this->client->request( $this->getPath() . '/' . $id, \Elastica\Request::DELETE ); + + $this->assertTrue( $response->isOk(), 'Removing test data failed' ); + } + $this->testDataIds = [ ]; + } + + /** + * Data provider for testApply + * + * @return array data for testApply + */ + public function applyProvider() { + return [ 'unrelated queries' => [ 'some query', new \Elastica\Query\MatchAll(), null, null ], 'citolytics query for some page' => [ CitolyticsFeature::CITOLYTICS_PREFIX . 'Some Title', new \Elastica\Query\Match( CitolyticsFeature::CITOLYTICS_FIELD, 'Some Title' ), CitolyticsFeature::CITOLYTICS_INDEX_BASE, new CitolyticsResultsType() ], ]; + } + + /** + * Tests CitolyticsFeature + * + * @dataProvider applyProvider + */ + public function testApply( $term, $expectedQuery, $expectedIndexBaseName, $expectedResultsType ) { + + $context = new SearchContext( $this->config ); + + // Finally run the test + $feature = new CitolyticsFeature(); + + $result = $feature->apply( $context, $term ); + + // Assert query + if ( $expectedQuery === null ) { + $this->assertFalse( $context->areResultsPossible() ); + } else { + $this->assertEquals( $expectedQuery, $context->getQuery() ); + if ( $expectedQuery instanceof \Elastica\Query\MatchAll ) { + $this->assertEquals( $term, $result, 'Term must be unchanged' ); + } else { + $this->assertEquals( '', $result, 'Term must be empty string' ); + } + } + + // Assert IndexBase + if ( $expectedIndexBaseName === null ) { + $this->assertFalse( $context->hasExtraIndexBaseName() ); + } else { + $this->assertEquals( $expectedIndexBaseName, $context->getExtraIndexBaseName(), 'IndexBaseName must be changed' ); + } + + // Assert ResultsType + if ( $expectedResultsType === null ) { + $this->assertFalse( $context->hasExtraResultsType() ); + } else { + $this->assertEquals( $expectedResultsType, $context->getExtraResultsType() ); + } + } + + /** + * Tests Citolytics search and result transformation with testData + */ + public function testSearcherAndResultsType() { + $engine = new \CirrusSearch(); + + foreach ( $this->testData as $data ) { + // Query with prefix and title + $status = $engine->searchText( CitolyticsFeature::CITOLYTICS_PREFIX . $data['title'] ); + + // Validate number of hits transformed by CitolyticsResultsType + $this->assertEquals( count( $data['related_content'] ), $status->getValue()->getTotalHits(), 'Invalid number of hits' ); + } + } +} -- To view, visit https://gerrit.wikimedia.org/r/329626 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7525eef60c60ce747d194321c552a3df22d96d8f Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Mschwarzer <wikit...@i.mieo.de> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits