DCausse has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/342820 )
Change subject: Implement RT updates for completion
......................................................................
Implement RT updates for completion
The approach is to hook a new job at the end of the LinksUpdate job.
This job is triggered only if the page affects a namespace supported
by the completion suggester.
Some additional params are added to LinksUpdate to track deletion
of redirects.
The process relies mostly on super_detect_noop to avoid too many updates
because I could not find a way to detect if the updated page is new.
If the super_detect_noop is not activated RT updates are deactivated.
Added a new param to all jobs to indicate if the job was triggered by
a maintenance script.
Bug: T154503
Change-Id: I2ceaa065b51d79690f03f123fd1771e500ffb93f
---
M CirrusSearch.php
M autoload.php
M docs/settings.txt
M includes/BuildDocument/Completion/SuggestBuilder.php
M includes/Connection.php
M includes/DataSender.php
M includes/Hooks.php
A includes/Job/CompletionSuggesterDeleteJob.php
A includes/Job/CompletionSuggesterUpdateJob.php
M includes/Job/Job.php
M includes/Job/LinksUpdate.php
M includes/Job/MassIndex.php
M includes/Job/OtherIndex.php
M includes/OtherIndexes.php
M includes/Sanity/QueueingRemediator.php
M includes/Updater.php
M maintenance/forceSearchIndex.php
M maintenance/updateSuggesterIndex.php
M tests/browser/features/step_definitions/search_steps.rb
M tests/browser/features/suggest_api.feature
M tests/browser/features/support/hooks.rb
M tests/jenkins/FullyFeaturedConfig.php
M tests/unit/SuggestBuilderTest.php
23 files changed, 1,138 insertions(+), 128 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/20/342820/1
diff --git a/CirrusSearch.php b/CirrusSearch.php
index 613839d..8dab83a 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -845,6 +845,11 @@
$wgCirrusSearchRecycleCompletionSuggesterIndex = true;
/**
+ * Realtime update options for the completion suggester
+ */
+$wgCirrusSearchCompletionSuggesterRTOptions = [];
+
+/**
* Enable alternative language search.
*/
$wgCirrusSearchEnableAltLanguage = false;
@@ -1138,6 +1143,8 @@
$wgJobClasses[ 'cirrusSearchOtherIndex' ] = 'CirrusSearch\Job\OtherIndex';
$wgJobClasses[ 'cirrusSearchElasticaWrite' ] =
'CirrusSearch\Job\ElasticaWrite';
$wgJobClasses[ 'cirrusSearchCheckerJob' ] = 'CirrusSearch\Job\CheckerJob';
+$wgJobClasses[ 'cirrusSearchCompletionSuggesterUpdateJob' ] =
'CirrusSearch\Job\CompletionSuggesterUpdateJob';
+$wgJobClasses[ 'cirrusSearchCompletionSuggesterDeleteJob' ] =
'CirrusSearch\Job\CompletionSuggesterDeleteJob';
/**
* Actions
diff --git a/autoload.php b/autoload.php
index 095fbb7..4db117c 100644
--- a/autoload.php
+++ b/autoload.php
@@ -59,6 +59,8 @@
'CirrusSearch\\Iterator\\CallbackIterator' => __DIR__ .
'/includes/iterator/CallbackIterator.php',
'CirrusSearch\\Iterator\\IteratorDecorator' => __DIR__ .
'/includes/iterator/IteratorDecorator.php',
'CirrusSearch\\Job\\CheckerJob' => __DIR__ .
'/includes/Job/CheckerJob.php',
+ 'CirrusSearch\\Job\\CompletionSuggesterDeleteJob' => __DIR__ .
'/includes/Job/CompletionSuggesterDeleteJob.php',
+ 'CirrusSearch\\Job\\CompletionSuggesterUpdateJob' => __DIR__ .
'/includes/Job/CompletionSuggesterUpdateJob.php',
'CirrusSearch\\Job\\DeletePages' => __DIR__ .
'/includes/Job/DeletePages.php',
'CirrusSearch\\Job\\ElasticaWrite' => __DIR__ .
'/includes/Job/ElasticaWrite.php',
'CirrusSearch\\Job\\IncomingLinkCount' => __DIR__ .
'/includes/Job/IncomingLinkCount.php',
diff --git a/docs/settings.txt b/docs/settings.txt
index f3663a8..4e6c875 100644
--- a/docs/settings.txt
+++ b/docs/settings.txt
@@ -1160,6 +1160,21 @@
documents are indexed and optimised with replication disabled
reducing the number of disk operation to primary shards only.
+; $wgCirrusSearchCompletionSuggesterRTOptions
+
+Default:
+ $wgCirrusSearchCompletionSuggesterRTOptions = []
+
+Enable Realtime updates of the completion suggester indices.
+
+Example:
+
+ $wgCirrusSearchCompletionSuggesterRTOptions = [
+ 'realtime_updates' => true,
+ // Lower values make search more real time but put more load on the
cluster
+ 'refresh_interval' => '60s',
+ ];
+
; $wgCirrusSearchEnableAltLanguage
Default:
diff --git a/includes/BuildDocument/Completion/SuggestBuilder.php
b/includes/BuildDocument/Completion/SuggestBuilder.php
index 642d1af..3290f66 100644
--- a/includes/BuildDocument/Completion/SuggestBuilder.php
+++ b/includes/BuildDocument/Completion/SuggestBuilder.php
@@ -2,6 +2,12 @@
namespace CirrusSearch\BuildDocument\Completion;
+use CirrusSearch\Connection;
+use CirrusSearch\Elastica\MultiSearch as MultiSearch;
+use CirrusSearch\SearchConfig;
+use Elastica;
+use Elastica\Query;
+use Elastica\Document;
use Title;
use LinkBatch;
@@ -96,13 +102,158 @@
private $targetNamespace = NS_MAIN;
/**
+ * @var bool true to indicate that realtime updates are accepted
+ */
+ private $rtUpdate = false;
+
+ /**
* @param SuggestScoringMethod $scoringMethod the scoring function to
use
* @param ExtraSuggestionsBuilder[] $extraBuilders set of extra builders
+ * @param bool $rtUpdate set to true to accept real-time updates
*/
- public function __construct( SuggestScoringMethod $scoringMethod, array
$extraBuilders = [] ) {
+ public function __construct( SuggestScoringMethod $scoringMethod, array
$extraBuilders = [], $rtUpdate = false ) {
$this->scoringMethod = $scoringMethod;
$this->extraBuilders = $extraBuilders;
$this->batchId = time();
+ $this->rtUpdate = $rtUpdate;
+ }
+
+ /**
+ * Create a completion suggester builder
+ *
+ * @param SearchConfig $config
+ * @param string|null $scoringMethodName force the scoring method
+ * @return SuggestBuilder
+ */
+ public static function createBuilder( SearchConfig $config,
$scoringMethodName = null ) {
+ $extraBuilders = [];
+ if( $config->get(
'CirrusSearchCompletionSuggesterUseDefaultSort' ) ) {
+ $extraBuilders[] = new DefaultSortSuggestionsBuilder();
+ }
+ $subPhrasesConfig = $config->get(
'CirrusSearchCompletionSuggesterSubphrases' );
+ if( $subPhrasesConfig['build'] ) {
+ $extraBuilders[] =
NaiveSubphrasesSuggestionsBuilder::create( $subPhrasesConfig );
+ }
+ if ( is_null( $scoringMethodName ) ) {
+ $scoringMethodName = $config->get(
'CirrusSearchCompletionDefaultScore' );
+ }
+
+ $rtUpdate = $config->getElement(
'CirrusSearchCompletionSuggesterRTOptions', 'realtime_updates' ) === true;
+ return new SuggestBuilder(
+ SuggestScoringMethodFactory::getScoringMethod(
$scoringMethodName ),
+ $extraBuilders,
+ $rtUpdate
+ );
+ }
+
+ /**
+ * @return string[] list of source index types.
+ */
+ public function getSourceIndexTypes() {
+ return [ Connection::CONTENT_INDEX_TYPE,
Connection::GENERAL_INDEX_TYPE ];
+ }
+
+ /**
+ * Builds the source query issued to main cirrus indices to retrieve
+ * input documents.
+ *
+ * @return \Elastica\Query
+ */
+ public function buildSourceQuery() {
+ $query = new Query();
+ $query->setSource( [
+ 'includes' => $this->getRequiredFields()
+ ] );
+
+ $pageAndNs = new Elastica\Query\BoolQuery();
+ $pageAndNs->addShould( new Elastica\Query\Term( [ "namespace"
=> $this->targetNamespace ] ) );
+ $pageAndNs->addShould( new Elastica\Query\Term( [
"redirect.namespace" => $this->targetNamespace ] ) );
+ $pageAndNs->addMust( new Elastica\Query\Type(
Connection::PAGE_TYPE_NAME ) );
+ $bool = new Elastica\Query\BoolQuery();
+ $bool->addFilter( $pageAndNs );
+
+ $query->setQuery( $bool );
+ return $query;
+ }
+
+ /**
+ * Builds a query to retrieve existing documents in the completion
+ * suggester index that have been created from $sourceDocIds
+ * @param string[] $sourceDocIds
+ * @param bool $withSource
+ * @return Query
+ */
+ public function buildSuggestRetrievalQuery( array $sourceDocIds,
$withSource ) {
+ $ids = new Elastica\Query\Terms(
+ 'source_doc_id',
+ $sourceDocIds
+ );
+ $query = new Query( $ids );
+ // In general we'll extract 1 or 2 docs
+ // We set 1024 here to handle the worst case scenario
+ // where 1024 cross-ns redirects were created to a single
+ // doc.
+ $query->setSize( min( 10000, count( $sourceDocIds ) * 1024 ) );
+ $query->setSource( $withSource );
+ return $query;
+ }
+
+ /**
+ * Check if RT updates are enabled and if the provided title
+ * may affect existing suggestions.
+ *
+ * @param Title $title
+ * @return bool true if this title can trigger a RT update
+ */
+ public function acceptRTUpdate( Title $title ) {
+ if ( !$this->rtUpdate ) {
+ return false;
+ }
+ // Only title or redirects that match the targetNamespace
+ // can trigger a modification
+ return $title->getNamespace() === $this->targetNamespace;
+ }
+
+ /**
+ * Preparation phase, inspect the number of documents in the source
+ * indices for normalization purposes.
+ * @param string $indexBaseName
+ * @param Connection $connection
+ */
+ public function prepare( $indexBaseName, Connection $connection ) {
+ $cache = \ObjectCache::getLocalClusterInstance();
+ $cacheKey = $cache->makeKey(
'cirrussearch-scoring-maxdocs-content' );
+ $total = $cache->getWithSetCallback(
+ $cacheKey,
+ 600,
+ function() use ( $indexBaseName, $connection ) {
+ // Indices to use for counting max_docs used by
scoring functions
+ // Since we work mostly on the content
namespace it seems OK to count
+ // only docs in the CONTENT index.
+ $countIndices = [
Connection::CONTENT_INDEX_TYPE ];
+
+ // Run a first query to count the number of
docs.
+ // This is needed for the scoring methods that
need
+ // to normalize values against wiki size.
+ $mSearch = new MultiSearch(
$connection->getClient() );
+ foreach ( $countIndices as $sourceIndexType ) {
+ $search = new \Elastica\Search(
$connection->getClient() );
+ $search->setQuery(
$this->buildSourceQuery() );
+ $search->addIndex(
$connection->getIndex( $indexBaseName, $sourceIndexType ) );
+ $search->getQuery()->setSize( 0 );
+ $mSearch->addSearch( $search );
+ }
+
+ $mSearchRes = $mSearch->search();
+ $total = 0;
+ foreach( $mSearchRes as $res ) {
+ $total += $res->getTotalHits();
+ }
+ return $total;
+ }
+ );
+
+ $this->scoringMethod->setMaxDocs( $total );
}
/**
@@ -178,6 +329,86 @@
}
}
return $docs;
+ }
+
+ /**
+ * Identifies which suggest documents need to be
+ * updated/deleted.
+ * Only the input of each suggest field is analyzed,
+ * weights and other fields are ignored.
+ *
+ * @param \Elastica\Document[] $newDocs newly created docs
+ * @param \Elastica\Document[] $existingDocs existing docs
+ * @return \Elastica\Document[][] Documents to update under
+ * the 'to_update' key and documents to delete under
+ * the 'to_delete' key
+ */
+ public function diff( array $newDocs, array $existingDocs ) {
+ $newById = [];
+ foreach( $newDocs as $doc ) {
+ $newById[$doc->getId()] = $doc;
+ }
+ $oldById = [];
+ foreach( $existingDocs as $doc ) {
+ $oldById[$doc->getId()] = $doc;
+ }
+ $toUpdate = [];
+ $toDelete = [];
+ foreach( $newById as $id => $doc ) {
+ if ( !isset( $oldById[$id] ) ||
+ $this->needsUpdate( $doc, $oldById[$id] )
+ ) {
+ $toUpdate[] = $doc;
+ }
+ }
+
+ foreach( $oldById as $id => $doc ) {
+ if ( !isset( $newById[$id] ) ) {
+ $toDelete[] = $doc;
+ }
+ }
+
+ return [
+ 'to_delete' => $toDelete,
+ 'to_update' => $toUpdate,
+ ];
+ }
+
+ /**
+ * Detects if new inputs are added/removed in $newDoc compared
+ * to $existingDoc
+ *
+ * @param Document $newDoc
+ * @param Document $existingDoc
+ * @return bool true if the existing doc needs to be updated
+ */
+ private function needsUpdate( Document $newDoc, Document $existingDoc )
{
+ $existingData = $existingDoc->getData();
+ foreach( $newDoc->getData() as $key => $data ) {
+ if ( isset( $data['input'] ) && is_array(
$data['input'] ) ) {
+ if ( !isset( $existingData[$key]['input'] )
+ || !is_array(
$existingData[$key]['input'] )
+ ) {
+ // new docs include a suggest field
missing
+ // in the existing doc
+ return true;
+ }
+ $countIntersection = count( array_intersect(
+ $data['input'],
+ $existingData[$key]['input']
+ ) );
+ $countOrig = count( $data['input'] );
+ if ( $countOrig !== $countIntersection ) {
+ // inputs differ update is needed.
+ return true;
+ }
+ } else if( isset( $existingData[$key]['input'] ) ) {
+ // $existing doc has suggestions but they
disappeared
+ // from the input doc
+ return true;
+ }
+ }
+ return false;
}
/**
@@ -450,19 +681,6 @@
*/
public static function encodeDocId( $suggestionType, $docId ) {
return $docId . $suggestionType;
- }
-
- /**
- * Encode possible docIds used by the completion suggester index
- *
- * @param string $docId
- * @return string[] list of docIds
- */
- public static function encodePossibleDocIds( $docId ) {
- return [
- self::encodeDocId( self::TITLE_SUGGESTION, $docId ),
- self::encodeDocId( self::REDIRECT_SUGGESTION, $docId ),
- ];
}
/**
diff --git a/includes/Connection.php b/includes/Connection.php
index 454e884..d3f27be 100644
--- a/includes/Connection.php
+++ b/includes/Connection.php
@@ -191,7 +191,17 @@
* @return \Elastica\Type
*/
public function getPageType( $name, $type = false ) {
- return $this->getIndex( $name, $type )->getType(
self::PAGE_TYPE_NAME );
+ return $this->getIndexType( $name, $type, self::PAGE_TYPE_NAME
);
+ }
+
+ /**
+ * Fetch an Elastica Type from an index.
+ * @param mixed $name basename of index
+ * @param mixed $cirrusType type of index (content or general or false to get all)
+ * @param string $elasticType name of the Elastica type to fetch from the index
+ * @return \Elastica\Type
+ */
+ public function getIndexType( $name, $cirrusType, $elasticType ) {
+ return $this->getIndex( $name, $cirrusType )->getType(
$elasticType );
}
/**
diff --git a/includes/DataSender.php b/includes/DataSender.php
index adb192f..b3d6979 100644
--- a/includes/DataSender.php
+++ b/includes/DataSender.php
@@ -161,7 +161,7 @@
* @param (\Elastica\Script|\Elastica\Document)[] $data documents to
send
* @return Status
*/
- public function sendData( $indexType, $data ) {
+ public function sendData( $indexType, $data, $elasticType =
Connection::PAGE_TYPE_NAME ) {
$documentCount = count( $data );
if ( $documentCount === 0 ) {
return Status::newGood();
@@ -175,7 +175,7 @@
$responseSet = null;
$justDocumentMissing = false;
try {
- $pageType = $this->connection->getPageType(
$this->indexBaseName, $indexType );
+ $pageType = $this->connection->getIndexType(
$this->indexBaseName, $indexType, $elasticType );
$this->start( new BulkUpdateRequestLog(
$this->connection->getClient(),
'sending {numBulk} documents to the {index}
index(s)',
@@ -205,6 +205,7 @@
$validResponse = $responseSet !== null && count(
$responseSet->getBulkResponses() ) > 0;
if ( $exception === null && ( $justDocumentMissing ||
$validResponse ) ) {
$this->success();
+ $this->reportUpdateMetrics( $responseSet, count( $data
) );
return Status::newGood();
} else {
$this->failure( $exception );
@@ -220,17 +221,53 @@
}
/**
+ * @param \Elastica\Bulk\ResponseSet $responseSet
+ * @param int $sent number of documents sent in the bulk request
+ */
+ private function reportUpdateMetrics( \Elastica\Bulk\ResponseSet
$responseSet, $sent ) {
+ $updateStats = [
+ 'sent' => $sent,
+ ];
+ $allowedOps = [ 'created', 'updated', 'noop' ];
+ foreach( $responseSet->getBulkResponses() as $bulk ) {
+ $opRes = 'unknown';
+ if ( $bulk instanceof \Elastica\Bulk\Response ) {
+ if ( isset( $bulk->getData()['result'] )
+ && in_array(
$bulk->getData()['result'], $allowedOps )
+ ) {
+ $opRes = $bulk->getData()['result'];
+ }
+ }
+ if ( isset ( $updateStats[$opRes] ) ) {
+ $updateStats[$opRes]++;
+ } else {
+ $updateStats[$opRes] = 1;
+ }
+ }
+ $stats =
\MediaWiki\MediaWikiServices::getInstance()->getStatsdDataFactory();
+ $cluster = $this->connection->getClusterName();
+ $metricsPrefix = "CirrusSearch.$cluster.updates";
+ foreach( $updateStats as $what => $num ) {
+ $stats->updateCount(
"$metricsPrefix.details.{$this->indexBaseName}.$indexType.$what", $num );
+ $stats->updateCount( "$metricsPrefix.all.$what", $num );
+ }
+ }
+
+ /**
* Send delete requests to Elasticsearch.
*
* @param string[] $docIds elasticsearch document ids to delete
* @param string|null $indexType index from which to delete. null
means all.
* @return Status
*/
- public function sendDeletes( $docIds, $indexType = null ) {
+ public function sendDeletes( $docIds, $indexType = null, $elasticType =
null ) {
if ( $indexType === null ) {
$indexes = $this->connection->getAllIndexTypes();
} else {
$indexes = [ $indexType ];
+ }
+
+ if ( $elasticType === null ) {
+ $elasticType = Connection::PAGE_TYPE_NAME;
}
if ( !$this->areIndexesAvailableForWrites( $indexes ) ) {
@@ -241,11 +278,14 @@
if ( $idCount !== 0 ) {
try {
foreach ( $indexes as $indexType ) {
- $this->startNewLog( 'deleting {numIds}
from {indexType}', 'send_deletes', [
+ $this->startNewLog( 'deleting {numIds}
from {indexType}/{elasticType}', 'send_deletes', [
'numIds' => $idCount,
'indexType' => $indexType,
+ 'elasticType' => $elasticType,
] );
- $this->connection->getPageType(
$this->indexBaseName, $indexType )->deleteIds( $docIds );
+ $this->connection
+ ->getIndexType(
$this->indexBaseName, $indexType, $elasticType )
+ ->deleteIds( $docIds );
$this->success();
}
} catch ( \Elastica\Exception\ExceptionInterface $e ) {
diff --git a/includes/Hooks.php b/includes/Hooks.php
index 74b5fcc..d6e15f2 100644
--- a/includes/Hooks.php
+++ b/includes/Hooks.php
@@ -329,12 +329,18 @@
// end up being a no-op when it executes.
$target = $page->getRedirectTarget();
if ( $target ) {
+ $redirId = $page->getTitle()->getArticleID();
+ $redirNs = $page->getTitle()->getNamespace();
+
// DeferredUpdate so we don't end up racing our own
page deletion
- DeferredUpdates::addCallableUpdate( function() use (
$target ) {
+ DeferredUpdates::addCallableUpdate( function() use (
$target, $redirId, $redirNs ) {
JobQueueGroup::singleton()->push(
new Job\LinksUpdate( $target, [
'addedLinks' => [],
'removedLinks' => [],
+ // Used by compsuggest RT
updates
+ 'deletedRedirectNamespace' =>
$redirNs,
+ 'deletedRedirectId' => $redirId,
] )
);
} );
@@ -477,6 +483,7 @@
'removedLinks' => self::prepareTitlesForLinksUpdate(
$linksUpdate->getRemovedLinks(),
$wgCirrusSearchUnlinkedArticlesToUpdate ),
];
+
// Prioritize jobs that are triggered from a web process. This
should prioritize
// single page update jobs over those triggered by template
changes.
if ( PHP_SAPI != 'cli' ) {
diff --git a/includes/Job/CompletionSuggesterDeleteJob.php
b/includes/Job/CompletionSuggesterDeleteJob.php
new file mode 100644
index 0000000..d2aaf34
--- /dev/null
+++ b/includes/Job/CompletionSuggesterDeleteJob.php
@@ -0,0 +1,60 @@
+<?php
+
+namespace CirrusSearch\Job;
+
+use CirrusSearch\Connection;
+use JobQueueGroup;
+use Title;
+
+/**
+ * Performs RT updates to the completion suggester index.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+class CompletionSuggesterDeleteJob extends Job {
+	/**
+	 * Deletes suggestions from the completion suggester index.
+	 *
+	 * The source document ids are read from the 'docIds' job param and
+	 * resolved to suggest document ids through
+	 * Updater::retrieveCompSuggestIds(). The resulting ids are then removed
+	 * with ElasticaWrite jobs that are run inline.
+	 *
+	 * @return bool always true
+	 */
+	protected function doJob() {
+		$updater = $this->createUpdater();
+		$docIds = $this->params['docIds'];
+		$compSuggestIds = [];
+		// NOTE: evaluate a solution based on deleteByQuery
+		foreach ( array_chunk( $docIds, 2 ) as $chunk ) {
+			// Very small chunks here, the worst case would be a
+			// single doc that generated 1024 cross-NS redirects
+			$compSuggestIds = array_merge(
+				$updater->retrieveCompSuggestIds( $chunk ),
+				$compSuggestIds
+			);
+		}
+		if ( empty( $compSuggestIds ) ) {
+			// No suggestion was found for these docs, nothing to delete
+			return true;
+		}
+		foreach ( array_chunk( $compSuggestIds, 1024 ) as $chunk ) {
+			$job = new ElasticaWrite(
+				// ElasticaWrite needs a title: use the one attached to this
+				// job, there is no list of titles available in this scope.
+				$this->title,
+				$this->jobParams( [
+					'method' => 'sendDeletes',
+					'arguments' => [
+						$chunk,
+						Connection::TITLE_SUGGEST_TYPE,
+						Connection::TITLE_SUGGEST_TYPE_NAME
+					]
+				] )
+			);
+			$job->run();
+		}
+		return true;
+	}
+}
diff --git a/includes/Job/CompletionSuggesterUpdateJob.php
b/includes/Job/CompletionSuggesterUpdateJob.php
new file mode 100644
index 0000000..146c215
--- /dev/null
+++ b/includes/Job/CompletionSuggesterUpdateJob.php
@@ -0,0 +1,161 @@
+<?php
+
+namespace CirrusSearch\Job;
+
+use CirrusSearch\BuildDocument\Completion\SuggestBuilder;
+use CirrusSearch\SearchConfig;
+use CirrusSearch\Connection;
+use JobQueueGroup;
+use Title;
+
+/**
+ * Performs RT updates to the completion suggester index.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+class CompletionSuggesterUpdateJob extends Job {
+	/**
+	 * Tries to update documents in the completion suggester.
+	 * The process is as follows:
+	 * - Resolve the page behind the job title with Updater::traceRedirects()
+	 * - Build new suggestion documents by retrieving the updated
+	 *   doc from the main index
+	 * - If the job was triggered by the deletion of a redirect, delete
+	 *   the suggest doc that is no longer generated
+	 * - Send the new suggestion documents as upserts
+	 *
+	 * When super_detect_noop is enabled only the inputs of the suggest
+	 * fields are sent, other changes (weights or any other attributes)
+	 * are not considered.
+	 *
+	 * Ideally this method should be triggered only when the text data is
+	 * likely to change:
+	 * - redirect added/removed
+	 * - new page
+	 *
+	 * @return bool always true
+	 */
+	protected function doJob() {
+		$updater = $this->createUpdater();
+		// Only the resolved page is needed here, the list of redirects
+		// returned alongside it is ignored.
+		list( $page ) = $updater->traceRedirects( $this->title );
+		if ( !$page ) {
+			// nothing to do
+			return true;
+		}
+		$title = $page->getTitle();
+		$sourceDoc = $updater->retrieveDoc( $title );
+		if ( $sourceDoc === null ) {
+			return true;
+		}
+
+		$indexBaseName = $this->searchConfig->get( SearchConfig::INDEX_BASE_NAME );
+		$builder = SuggestBuilder::createBuilder( $this->searchConfig );
+		$builder->prepare( $indexBaseName, $this->connection );
+		$newDocs = $builder->build( [ [ 'id' => $sourceDoc->getId(), 'source' => $sourceDoc->getData() ] ] );
+
+		// Flag every new doc as an upsert and collect its id
+		$docIds = array_map(
+			function( $doc ) {
+				$doc->setDocAsUpsert( true );
+				return $doc->getId();
+			},
+			$newDocs
+		);
+
+		// This job was triggered because a redirect was deleted
+		// First evaluate if we need to delete one of the suggest docs.
+		// Candidates are:
+		// - a crossNS redirect: has its own suggest doc as a title suggestion
+		// - the last redirect: the suggest doc holding all the redirects is
+		//   now empty and probably needs to be deleted from the index.
+		if ( isset( $this->params['deletedRedirectNamespace'] ) ) {
+			$redirId = $this->params['deletedRedirectId'];
+			$redirSuggId = SuggestBuilder::encodeDocId( SuggestBuilder::REDIRECT_SUGGESTION, $sourceDoc->getId() );
+			$crossNsDocId = SuggestBuilder::encodeDocId( SuggestBuilder::TITLE_SUGGESTION, $redirId );
+			$toDelete = [];
+			if ( $title->getNamespace() !== $builder->getTargetNamespace()
+				&& !in_array( $crossNsDocId, $docIds )
+			) {
+				// We need to delete the crossNS redirect
+				$toDelete[] = $crossNsDocId;
+			} else if ( !in_array( $redirSuggId, $docIds ) ) {
+				// Check if this was the latest redirect removed
+				// then trigger a delete of the redirect suggestions
+				$toDelete[] = $redirSuggId;
+			}
+
+			if ( !empty( $toDelete ) ) {
+				// jobParams() forwards the 'cluster' and 'maintenance'
+				// params of this job to the write job.
+				$job = new ElasticaWrite(
+					$title,
+					$this->jobParams( [
+						'method' => 'sendDeletes',
+						'arguments' => [
+							$toDelete,
+							Connection::TITLE_SUGGEST_TYPE,
+							Connection::TITLE_SUGGEST_TYPE_NAME
+						],
+					] )
+				);
+				$job->run();
+			}
+		}
+
+		if ( $this->searchConfig->getElement( 'CirrusSearchWikimediaExtraPlugin', 'super_detect_noop' ) ) {
+			$newDocs = array_map( [ $this, 'toSuperNoop' ], $newDocs );
+		}
+
+		foreach ( array_chunk( $newDocs, 10 ) as $chunked ) {
+			$job = new ElasticaWrite(
+				$title,
+				$this->jobParams( [
+					'method' => 'sendData',
+					'arguments' => [
+						Connection::TITLE_SUGGEST_TYPE,
+						$chunked,
+						Connection::TITLE_SUGGEST_TYPE_NAME
+					],
+				] )
+			);
+			$job->run();
+		}
+
+		// Same result as the early exits above instead of an implicit null
+		return true;
+	}
+
+	/**
+	 * Wraps a suggest document into a super_detect_noop script that only
+	 * carries the 'input' of each suggest field. The document itself is
+	 * attached as the upsert when it is flagged as such.
+	 *
+	 * @param \Elastica\Document $doc
+	 * @return \Elastica\Script\Script
+	 */
+	private function toSuperNoop( \Elastica\Document $doc ) {
+		$params = $doc->getParams();
+		foreach ( $doc->getData() as $k => $v ) {
+			// Hazardous detection of suggest types
+			if ( is_array( $v )
+				&& isset( $v['weight'] )
+				&& isset( $v["input"] )
+			) {
+				// We only want to update suggestions
+				// not the weights
+				$params['source'][$k]['input'] = $v["input"];
+			}
+		}
+		// FIXME: need to figure out how to support version here
+		// classic versioning may trigger too many updates
+		$script = new \Elastica\Script\Script( 'super_detect_noop', $params, 'native' );
+		if ( $doc->getDocAsUpsert() ) {
+			$script->setUpsert( $doc );
+		}
+		return $script;
+	}
+}
diff --git a/includes/Job/Job.php b/includes/Job/Job.php
index 5f9b6e5..358464b 100644
--- a/includes/Job/Job.php
+++ b/includes/Job/Job.php
@@ -52,7 +52,7 @@
* @param array $params
*/
public function __construct( $title, $params ) {
- $params += [ 'cluster' => null ];
+ $params += [ 'cluster' => null, 'maintenance' => false ];
// eg: DeletePages -> cirrusSearchDeletePages
$jobName = 'cirrusSearch' . str_replace( 'CirrusSearch\\Job\\',
'', static::class );
parent::__construct( $jobName, $title, $params );
@@ -138,6 +138,9 @@
*/
protected function createUpdater() {
$flags = [];
+ if ( $this->params['maintenance'] ) {
+ $flags[] = 'maintenance';
+ }
if ( isset( $this->params['cluster'] ) ) {
$flags[] = 'same-cluster';
}
@@ -209,4 +212,11 @@
return $conns;
}
+
+ protected function jobParams( array $params = [] ) {
+ return $params + [
+ 'maintenance' => $this->params['maintenance'],
+ 'cluster' => $this->params['cluster'],
+ ];
+ }
}
diff --git a/includes/Job/LinksUpdate.php b/includes/Job/LinksUpdate.php
index 6507448..39e07da 100644
--- a/includes/Job/LinksUpdate.php
+++ b/includes/Job/LinksUpdate.php
@@ -2,6 +2,7 @@
namespace CirrusSearch\Job;
+use \CirrusSearch\BuildDocument\Completion\SuggestBuilder;
use JobQueueGroup;
use Title;
@@ -47,10 +48,25 @@
$updater = $this->createUpdater();
$res = $updater->updateFromTitle( $this->title );
+
if ( $res === false ) {
// Couldn't update. Bail early and retry rather than
adding an
// IncomingLinkCount job that will produce the wrong
answer.
return $res;
+ }
+
+ if ( $this->isCompletionUpdateNeeded() ) {
+ $jobParams = [];
+ if ( isset( $this->params['deletedRedirectNamespace'] )
) {
+ $jobParams['deletedRedirectNamespace'] =
$this->params['deletedRedirectNamespace'];
+ $jobParams['deletedRedirectId'] =
$this->params['deletedRedirectId'];
+ }
+ $compSuggestJob = new CompletionSuggesterUpdateJob(
+ $this->title,
+ $this->jobParams( $jobParams )
+ );
+ $compSuggestJob->setDelay( 2 *
$wgCirrusSearchRefreshInterval + 1 );
+ JobQueueGroup::singleton()->push( $compSuggestJob );
}
// Queue IncomingLinkCount jobs when pages are newly linked or
unlinked
@@ -61,9 +77,7 @@
if ( !$title ) {
continue;
}
- $linkCount = new IncomingLinkCount( $title, [
- 'cluster' => $this->params['cluster'],
- ] );
+ $linkCount = new IncomingLinkCount( $title,
$this->jobParams() );
// If possible, delay the job execution by a few
seconds so Elasticsearch
// can refresh to contain what we just sent it. The
delay should be long
// enough for Elasticsearch to complete the refresh
cycle, which normally
@@ -78,6 +92,41 @@
}
/**
+ * Checks whether this job must trigger an update on the completion
+ * suggester index.
+ *
+ * @return bool true if an update to completion suggester index is
+ * needed
+ */
+ public function isCompletionUpdateNeeded() {
+ if ( !$this->searchConfig->getElement(
'CirrusSearchWikimediaExtraPlugin', 'super_detect_noop' ) ) {
+ // Without super_noop we will certainly send too many updates
+ return false;
+ }
+ if ( $this->params['maintenance'] ) {
+ // No RT updates during maintenance
+ return false;
+ }
+
+ if ( $this->searchConfig->get(
'CirrusSearchUseCompletionSuggester' ) === 'no' ) {
+ return false;
+ }
+
+ // Ideally we'd like to know if this update
+ // was triggered because:
+ // - this is a new page
+ // - a new redirect was added
+ $builder = SuggestBuilder::createBuilder( $this->searchConfig );
+ if ( isset( $this->params['deletedRedirectNamespace'] ) &&
+ $builder->getTargetNamespace() ===
$this->params['deletedRedirectNamespace']
+ ) {
+ return true;
+ }
+
+ return $builder->acceptRTUpdate( $this->title );
+ }
+
+ /**
* @return bool Is this job prioritized?
*/
public function isPrioritized() {
diff --git a/includes/Job/MassIndex.php b/includes/Job/MassIndex.php
index dc11908..664ae8b 100644
--- a/includes/Job/MassIndex.php
+++ b/includes/Job/MassIndex.php
@@ -40,11 +40,11 @@
}
// We don't have a "title" for this job so we use the Main Page
because it exists.
- return new self( Title::newMainPage(), [
+ return new self( Title::newMainPage(), $this->jobParams( [
'pageDBKeys' => $pageDBKeys,
'updateFlags' => $updateFlags,
'cluster' => $cluster,
- ] );
+ ] ) );
}
/**
diff --git a/includes/Job/OtherIndex.php b/includes/Job/OtherIndex.php
index d69a562..865aa42 100644
--- a/includes/Job/OtherIndex.php
+++ b/includes/Job/OtherIndex.php
@@ -29,10 +29,10 @@
* Check if we need to make a job and inject one if so.
*
* @param Title[] $titles The title we might update
- * @param string|null $cluster The name of the cluster to write
+ * @param mixed[] $jobParams job params; 'cluster' names the cluster to write
* to, or null for all clusters.
*/
- public static function queueIfRequired( array $titles, $cluster ) {
+ public static function queueIfRequired( array $titles, array $jobParams
= [] ) {
$titlesToUpdate = [];
foreach( $titles as $title ) {
if ( OtherIndexes::getExternalIndexes( $title ) ) {
@@ -43,9 +43,8 @@
// Note that we're updating a bunch of titles but we
have to pick one to
// attach to the job so we pick the first one.
JobQueueGroup::singleton()->push(
- new self( $titles[ 0 ], [
+ new self( $titles[ 0 ], $jobParams + [
'titles' => $titlesToUpdate,
- 'cluster' => $cluster,
] )
);
}
diff --git a/includes/OtherIndexes.php b/includes/OtherIndexes.php
index fe97f74..58a2b04 100644
--- a/includes/OtherIndexes.php
+++ b/includes/OtherIndexes.php
@@ -150,12 +150,12 @@
// These are split into a job per index so one index
// being frozen doesn't block updates to other indexes
// in the same update.
+ $updater = $this->createUpdater();
foreach ( $updates as $indexName => $actions ) {
- $job = new Job\ElasticaWrite( reset( $titles ), [
+ $job = new Job\ElasticaWrite( reset( $titles ),
$updates->jobParams( [
'method' => 'sendOtherIndexUpdates',
'arguments' => [ $this->localSite, $indexName,
$actions ],
- 'cluster' => $this->writeToClusterName,
- ] );
+ ] ) );
$job->run();
}
}
diff --git a/includes/Sanity/QueueingRemediator.php
b/includes/Sanity/QueueingRemediator.php
index 10df29d..c56c8ca 100644
--- a/includes/Sanity/QueueingRemediator.php
+++ b/includes/Sanity/QueueingRemediator.php
@@ -54,10 +54,9 @@
*/
public function ghostPageInIndex( $docId, Title $title ) {
JobQueueGroup::singleton()->push(
- new DeletePages( $title, [
+ new DeletePages( $title, $this->jobParams( [
'docId' => $docId,
- 'cluster' => $this->cluster,
- ] )
+ ] ) )
);
}
@@ -68,11 +67,10 @@
*/
public function pageInWrongIndex( $docId, WikiPage $page, $wrongIndex )
{
JobQueueGroup::singleton()->push(
- new DeletePages( $page->getTitle(), [
+ new DeletePages( $page->getTitle(), $this->jobParams( [
'indexType' => $wrongIndex,
'docId' => $docId,
- 'cluster' => $this->cluster,
- ] )
+ ] ) )
);
$this->pushLinksUpdateJob( $page );
}
@@ -88,11 +86,21 @@
private function pushLinksUpdateJob( WikiPage $page ) {
JobQueueGroup::singleton()->push(
- new LinksUpdate( $page->getTitle(), [
+ new LinksUpdate( $page->getTitle(), $this->jobParams( [
'addedLinks' => [],
'removedLinks' => [],
- 'cluster' => $this->cluster,
- ] )
+ ] ) )
);
}
+
+ /**
+ * Complete job-specific params with the params shared by every job
+ * queued from this remediator.
+ *
+ * Keys already present in $params are kept: the array union operator
+ * only fills in missing keys.
+ *
+ * @param mixed[] $params job-specific params
+ * @return mixed[] $params plus 'cluster' and 'maintenance' when not already set
+ */
+ private function jobParams( array $params = [] ) {
+ $shared = [
+ 'cluster' => $this->cluster,
+ 'maintenance' => true,
+ ];
+ return $params + $shared;
+ }
}
diff --git a/includes/Updater.php b/includes/Updater.php
index 0b20675..01898f0 100644
--- a/includes/Updater.php
+++ b/includes/Updater.php
@@ -2,6 +2,7 @@
namespace CirrusSearch;
+use CirrusSearch\BuildDocument\Completion\SuggestBuilder;
use Hooks as MWHooks;
use MediaWiki\Logger\LoggerFactory;
use ParserCache;
@@ -54,6 +55,19 @@
*/
protected $searchConfig;
+ /** @var \Psr\Log\LoggerInterface */
+ private $log;
+
+ /**
+ * @var string
+ */
+ private $indexBaseName;
+
+ /**
+ * @var bool true if running from a maintenance script
+ */
+ private $maintUpdates = false;
+
/**
* @param Connection $conn
* @param SearchConfig $config
@@ -62,8 +76,14 @@
public function __construct( Connection $conn, SearchConfig $config,
array $flags = [] ) {
parent::__construct( $conn, null, 0 );
$this->searchConfig = $config;
+ $this->indexBaseName = $config->get(
SearchConfig::INDEX_BASE_NAME );
+ $this->log = LoggerFactory::getInstance( 'CirrusSearch' );
if ( in_array( 'same-cluster', $flags ) ) {
$this->writeToClusterName =
$this->connection->getClusterName();
+ }
+
+ if ( in_array( 'maintenance', $flags ) ) {
+ $this->maintUpdates = true;
}
}
@@ -124,9 +144,8 @@
}
$page = WikiPage::factory( $title );
- $logger = LoggerFactory::getInstance( 'CirrusSearch' );
if ( !$page->exists() ) {
- $logger->debug( "Ignoring an update for a
nonexistent page: $titleText" );
+ $this->log->debug( "Ignoring an update for a
nonexistent page: $titleText" );
return [ null, $redirects ];
}
$content = $page->getContent();
@@ -145,7 +164,7 @@
$target = $content->getUltimateRedirectTarget();
if ( $target->equals( $page->getTitle() ) ) {
// This doesn't warn about redirect
loops longer than one but we'll catch those anyway.
- $logger->info( "Title redirecting to
itself. Skip indexing" );
+ $this->log->info( "Title redirecting to
itself. Skip indexing" );
return [ null, $redirects ];
}
$title = $target;
@@ -153,6 +172,105 @@
} else {
return [ $page, $redirects ];
}
+ }
+ }
+
+ /**
+ * Fetch the completion suggester docs that were generated from $title.
+ *
+ * The article id of the title is turned into a doc id with makeId()
+ * and handed to retrieveCompSuggestPages().
+ *
+ * @param Title $title
+ * @return \Elastica\Document[]
+ * @throws \RuntimeException
+ */
+ public function retrieveCompSuggestPagesFromTitle( Title $title ) {
+ $articleId = $title->getArticleID();
+ return $this->retrieveCompSuggestPages( [
+ $this->searchConfig->makeId( $articleId ),
+ ] );
+ }
+
+ /**
+ * Fetch suggester docs, source included, and wrap every hit
+ * in a new \Elastica\Document built from its id and source.
+ *
+ * @param string[] $docIds ids handed to the suggest retrieval query
+ * @return \Elastica\Document[]
+ */
+ public function retrieveCompSuggestPages( array $docIds ) {
+ $hits = $this->searchCompSuggest( $docIds, true );
+ $documents = [];
+ foreach ( $hits as $hit ) {
+ $id = $hit->getId();
+ $source = $hit->getSource();
+ $documents[] = new \Elastica\Document( $id, $source );
+ }
+ return $documents;
+ }
+
+ /**
+ * Fetch only the ids of the suggester docs returned for $docIds;
+ * the source is not requested.
+ *
+ * @param string[] $docIds
+ * @return string[] ids of the hits returned by the suggester search
+ */
+ public function retrieveCompSuggestIds( array $docIds ) {
+ $found = [];
+ $hits = $this->searchCompSuggest( $docIds, false );
+ foreach ( $hits as $hit ) {
+ $found[] = $hit->getId();
+ }
+ return $found;
+ }
+
+ /**
+ * Run the suggest retrieval query built by SuggestBuilder against the
+ * title suggest index type.
+ *
+ * @param string[] $docIds
+ * @param bool $withSource fetch the source
+ * @return \Elastica\ResultSet
+ * @throws \RuntimeException if the search fails; the Elastica exception
+ * is attached as the previous exception
+ */
+ private function searchCompSuggest( array $docIds, $withSource = false ) {
+ $suggestType = $this->connection->getIndexType(
+ $this->indexBaseName,
+ Connection::TITLE_SUGGEST_TYPE,
+ Connection::TITLE_SUGGEST_TYPE_NAME
+ );
+ $builder = SuggestBuilder::createBuilder( $this->searchConfig );
+ try {
+ $search = new \Elastica\Search( $this->connection->getClient() );
+ $search->addIndex( $suggestType->getIndex() );
+ $search->addType( $suggestType );
+ $query = $builder->buildSuggestRetrievalQuery( $docIds, $withSource );
+ $search->setQuery( $query );
+ $this->start( $this->newLog(
+ 'Get suggest source docs for CompSuggest RT updates',
+ 'compsuggest_rt_retrieve'
+ ) );
+ $results = $search->search();
+ $this->success( $results );
+ return $results;
+ } catch( \Elastica\Exception\ExceptionInterface $exc ) {
+ $this->failure( $exc );
+ // \RuntimeException takes ( $message, $code, $previous ): the second
+ // argument is an integer code, the caught exception goes third.
+ throw new \RuntimeException( "Cannot retrieve suggest doc", 0, $exc );
+ }
+ }
+
+ /**
+ * Retrieve document from the main index
+ *
+ * @param Title $title
+ * @return \Elastica\Document|null the document or null if not found
+ * @throws \RuntimeException on any Elastica failure other than "not found";
+ * the Elastica exception is attached as the previous exception
+ */
+ public function retrieveDoc( Title $title ) {
+ $docId = $this->searchConfig->makeId( $title->getArticleID() );
+ $type = $this->connection->getPageType(
+ $this->indexBaseName,
+ $this->connection->getIndexSuffixForNamespace(
+ $title->getNamespace()
+ ));
+ try {
+ $this->start( $this->newLog(
+ 'Get source docs for RT updates',
+ 'updater_get'
+ ) );
+ $doc = $type->getDocument( $docId );
+ $this->success();
+ return $doc;
+ } catch( \Elastica\Exception\NotFoundException $nfe ) {
+ // A missing document is reported as a success and yields null.
+ $this->success();
+ return null;
+ } catch( \Elastica\Exception\ExceptionInterface $exc ) {
+ $this->failure( $exc );
+ // \RuntimeException takes ( $message, $code, $previous ): the second
+ // argument is an integer code, the caught exception goes third.
+ throw new \RuntimeException( "Cannot fetch doc $docId", 0, $exc );
}
}
@@ -192,7 +310,7 @@
} );
$titles = $this->pagesToTitles( $pages );
- Job\OtherIndex::queueIfRequired( $titles,
$this->writeToClusterName );
+ Job\OtherIndex::queueIfRequired( $titles, $this->jobParams() );
$allData = array_fill_keys(
$this->connection->getAllIndexTypes(), [] );
foreach ( $this->buildDocumentsForPages( $pages, $flags ) as
$document ) {
@@ -211,11 +329,10 @@
foreach( array_chunk( $data, 10 ) as $chunked ) {
$job = new Job\ElasticaWrite(
reset( $titles ),
- [
+ $this->jobParams( [
'method' => 'sendData',
'arguments' => [ $indexType,
$chunked ],
- 'cluster' =>
$this->writeToClusterName,
- ]
+ ] )
);
// This job type will insert itself into the
job queue
// with a delay if writes to ES are currently
unavailable
@@ -233,23 +350,46 @@
*
* @param Title[] $titles List of titles to delete. If empty then
skipped other index
* maintenance is skipped.
- * @param integer[] $docIds List of elasticsearch document ids to delete
- * @param string $indexType index from which to delete
- * @return bool True if nothing happened or we successfully deleted,
false on failure
+ * @param string[] $docIds List of elasticsearch document ids to delete
+ * @param string|null $indexType index from which to delete
*/
public function deletePages( $titles, $docIds, $indexType = null ) {
- Job\OtherIndex::queueIfRequired( $titles,
$this->writeToClusterName );
+ Job\OtherIndex::queueIfRequired( $titles, $this->jobParams() );
$job = new Job\ElasticaWrite(
$titles ? reset( $titles ) : Title::makeTitle( 0, "" ),
- [
+ $this->jobParams( [
'method' => 'sendDeletes',
'arguments' => [ $docIds, $indexType ],
- 'cluster' => $this->writeToClusterName,
- ]
+ ] )
);
// This job type will insert itself into the job queue
// with a delay if writes to ES are currently paused
$job->run();
+
+ $this->deleteCompSuggestPages( $titles, $docIds, $indexType );
+ }
+
+ /**
+ * Delete pages from the titlesuggest index if realtime updates are enabled
+ *
+ * No job is queued when the completion suggester is not in use, when
+ * realtime updates are not enabled, or when this updater runs from a
+ * maintenance script.
+ *
+ * @param Title[] $titles List of titles to delete.
+ * @param string[] $docIds List of elasticsearch document ids to delete
+ */
+ public function deleteCompSuggestPages( array $titles, array $docIds ) {
+ if ( $this->searchConfig->get( 'CirrusSearchUseCompletionSuggester' ) === 'no' ) {
+ return;
+ }
+ $realtime = $this->searchConfig->getElement( 'CirrusSearchCompletionSuggesterRTOptions', 'realtime_updates' );
+ // NOTE(review): the two conditions were joined with "&&", which still
+ // queued the job when realtime updates were disabled outside of
+ // maintenance scripts. Skipping every maintenance run is assumed to be
+ // the intent - confirm.
+ if ( $realtime !== true || $this->maintUpdates ) {
+ return;
+ }
+ $job = new Job\CompletionSuggesterDeleteJob(
+ // Same placeholder title as deletePages() when no title is given.
+ $titles ? reset( $titles ) : Title::makeTitle( 0, "" ),
+ $this->jobParams( [
+ 'docIds' => $docIds,
+ ] )
+ );
+ \JobQueueGroup::singleton()->push( $job );
+ }
/**
@@ -271,7 +411,7 @@
foreach ( $pages as $page ) {
$title = $page->getTitle();
if ( !$page->exists() ) {
- LoggerFactory::getInstance( 'CirrusSearch'
)->warning(
+ $this->log->warning(
'Attempted to build a document for a
page that doesn\'t exist. This should be caught ' .
"earlier but wasn't. Page: {title}",
[ 'title' => $title ]
@@ -427,4 +567,17 @@
$extra
);
}
+
+ /**
+ * Add the generic params ('maintenance' and 'cluster') to the
+ * job-specific params.
+ *
+ * Values already present in $params are kept: the array union operator
+ * only fills in missing keys.
+ *
+ * @param mixed[] $params specific params
+ * @return mixed[] specific params plus generic parameters
+ */
+ public function jobParams( array $params = [] ) {
+ $generic = [
+ 'maintenance' => $this->maintUpdates,
+ 'cluster' => $this->writeToClusterName,
+ ];
+ return $params + $generic;
+ }
}
diff --git a/maintenance/forceSearchIndex.php b/maintenance/forceSearchIndex.php
index 2d3563a..9aa0a38 100644
--- a/maintenance/forceSearchIndex.php
+++ b/maintenance/forceSearchIndex.php
@@ -579,7 +579,7 @@
* @return Updater
*/
private function createUpdater() {
- $flags = [];
+ $flags = [ 'maintenance' ];
if ( $this->hasOption( 'cluster' ) ) {
$flags[] = 'same-cluster';
}
diff --git a/maintenance/updateSuggesterIndex.php
b/maintenance/updateSuggesterIndex.php
index da329b9..6ac2308 100644
--- a/maintenance/updateSuggesterIndex.php
+++ b/maintenance/updateSuggesterIndex.php
@@ -8,11 +8,8 @@
use CirrusSearch\BuildDocument\Completion\DefaultSortSuggestionsBuilder;
use CirrusSearch\BuildDocument\Completion\NaiveSubphrasesSuggestionsBuilder;
use CirrusSearch\BuildDocument\Completion\SuggestBuilder;
-use CirrusSearch\BuildDocument\Completion\SuggestScoringMethodFactory;
-use CirrusSearch\BuildDocument\Completion\SuggestScoringMethod;
use CirrusSearch\Maintenance\Validators\AnalyzersValidator;
use CirrusSearch\SearchConfig;
-use CirrusSearch\Elastica\MultiSearch as MultiSearch;
use Elastica;
use Elastica\Index;
use Elastica\Query;
@@ -84,11 +81,6 @@
private $scoreMethodName;
/**
- * @var SuggestScoringMethod the score function to use.
- */
- private $scoreMethod;
-
- /**
* @var Index old suggester index that will be deleted at the end of
the process
*/
private $oldIndex;
@@ -141,6 +133,21 @@
private $recycle = false;
/**
+ * @var bool
+ */
+ private $realtimeUpdates = false;
+
+ /**
+ * @var string refresh interval for realtime updates
+ */
+ private $refreshInterval = '-1';
+
+ /**
+ * @var DataSender
+ */
+ private $dataSender;
+
+ /**
* @var string[]
*/
private $bannedPlugins;
@@ -178,11 +185,23 @@
$wgCirrusSearchBannedPlugins,
$wgCirrusSearchMasterTimeout,
$wgCirrusSearchMaxShardsPerNode,
- $wgCirrusSearchCompletionDefaultScore;
+ $wgCirrusSearchCompletionDefaultScore,
+ $wgCirrusSearchCompletionSuggesterRTOptions;
+ $this->realtimeUpdates = isset(
$wgCirrusSearchCompletionSuggesterRTOptions['realtime_updates'] ) &&
+
$wgCirrusSearchCompletionSuggesterRTOptions['realtime_updates'] === true;
+
+
+ if ( $this->realtimeUpdates ) {
+ if ( !isset(
$wgCirrusSearchCompletionSuggesterRTOptions['refresh_interval'] ) ) {
+ $this->error( 'refresh_interval must be set on
$wgCirrusSearchCompletionSuggesterRTOptions in order to enable
realtime_updates', 1 );
+ }
+ $this->refreshInterval =
$wgCirrusSearchCompletionSuggesterRTOptions['refresh_interval'];
+ }
$this->disablePoolCountersAndLogging();
$this->masterTimeout = $this->getOption( 'masterTimeout',
$wgCirrusSearchMasterTimeout );
$this->indexTypeName = Connection::TITLE_SUGGEST_TYPE;
+
// Check that all shards and replicas settings are set
try {
@@ -198,6 +217,7 @@
$this->optimizeIndex = $this->getOption( 'optimize', false );
+ $this->dataSender = new DataSender( $this->getConnection(),
$this->getSearchConfig() );
$this->utils = new ConfigUtils( $this->getClient(), $this);
$this->langCode = $wgLanguageCode;
@@ -211,17 +231,7 @@
$this->maxShardsPerNode = isset(
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] ) ?
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] : 'unlimited';
$this->scoreMethodName = $this->getOption( 'scoringMethod',
$wgCirrusSearchCompletionDefaultScore );
- $this->scoreMethod =
SuggestScoringMethodFactory::getScoringMethod( $this->scoreMethodName );
-
- $extraBuilders = [];
- if( $this->getSearchConfig()->get(
'CirrusSearchCompletionSuggesterUseDefaultSort' ) ) {
- $extraBuilders[] = new DefaultSortSuggestionsBuilder();
- }
- $subPhrasesConfig = $this->getSearchConfig()->get(
'CirrusSearchCompletionSuggesterSubphrases' );
- if( $subPhrasesConfig['build'] ) {
- $extraBuilders[] =
NaiveSubphrasesSuggestionsBuilder::create( $subPhrasesConfig );
- }
- $this->builder = new SuggestBuilder( $this->scoreMethod,
$extraBuilders );
+ $this->builder = SuggestBuilder::createBuilder(
$this->getSearchConfig(), $this->scoreMethodName );
try {
// If the version does not exist it's certainly because
nothing has been indexed.
@@ -236,11 +246,13 @@
# check for broken indices and delete them
$this->checkAndDeleteBrokenIndices();
+ $this->freezeWrites();
if ( !$this->canRecycle() ) {
$this->rebuild();
} else {
$this->recycle();
}
+ $this->resumeWrites();
} catch ( \Elastica\Exception\Connection\HttpException $e ) {
$message = $e->getMessage();
$this->log( "\nUnexpected Elasticsearch failure.\n" );
@@ -263,9 +275,7 @@
* @return true if the cluster/index is not frozen, false otherwise.
*/
private function canWrite() {
- // Reuse DataSender even if we don't send anything with it.
- $sender = new DataSender( $this->getConnection(),
$this->getSearchConfig() );
- return $sender->areIndexesAvailableForWrites( [
$this->getIndexTypeName() ] );
+ return $this->dataSender->areIndexesAvailableForWrites( [
Connection::TITLE_SUGGEST_TYPE ] );
}
/**
@@ -332,11 +342,6 @@
$this->error( 'Index does not exist yet cannot
recycle.' );
return false;
}
- $refresh = $oldIndex->getSettings()->getRefreshInterval();
- if ( $refresh != '-1' ) {
- $this->error( 'Refresh interval is not -1, cannot
recycle.' );
- return false;
- }
$shards = $oldIndex->getSettings()->get( 'number_of_shards' );
// We check only the number of shards since it cannot be
updated.
@@ -398,6 +403,7 @@
private function recycle() {
$this->log( "Recycling index {$this->getIndex()->getName()}\n");
$this->recycle = true;
+ $this->disableRefreshInterval();
$this->indexData();
// This is fragile... hopefully most of the docs will be
deleted from the old segments
// and will result in a fast operation.
@@ -460,6 +466,7 @@
// Refresh the reader so it now uses the optimized FST,
// and actually free and delete old segments.
$this->getIndex()->refresh();
+ $this->enableRefreshInterval();
}
private function deleteOldIndex() {
@@ -507,47 +514,10 @@
private function indexData() {
// We build the suggestions by reading CONTENT and GENERAL
indices.
// This does not support extra indices like FILES on commons.
- $sourceIndexTypes = [ Connection::CONTENT_INDEX_TYPE,
Connection::GENERAL_INDEX_TYPE ];
+ $this->builder->prepare( $this->indexBaseName,
$this->getConnection() );
- // Indices to use for counting max_docs used by scoring
functions
- // Since we work mostly on the content namespace it seems OK to
count
- // only docs in the CONTENT index.
- $countIndices = [ Connection::CONTENT_INDEX_TYPE ];
-
- $query = new Query();
- $query->setSource( [
- 'includes' => $this->builder->getRequiredFields()
- ] );
-
- $pageAndNs = new Elastica\Query\BoolQuery();
- $pageAndNs->addShould( new Elastica\Query\Term( [ "namespace"
=> NS_MAIN ] ) );
- $pageAndNs->addShould( new Elastica\Query\Term( [
"redirect.namespace" => NS_MAIN ] ) );
- $pageAndNs->addMust( new Elastica\Query\Type(
Connection::PAGE_TYPE_NAME ) );
- $bool = new Elastica\Query\BoolQuery();
- $bool->addFilter( $pageAndNs );
-
- $query->setQuery( $bool );
-
- // Run a first query to count the number of docs.
- // This is needed for the scoring methods that need
- // to normalize values against wiki size.
- $mSearch = new MultiSearch( $this->getClient() );
- foreach ( $countIndices as $sourceIndexType ) {
- $search = new \Elastica\Search( $this->getClient() );
- $search->addIndex( $this->getConnection()->getIndex(
$this->indexBaseName, $sourceIndexType ) );
- $search->getQuery()->setSize( 0 );
- $mSearch->addSearch( $search );
- }
-
- $mSearchRes = $mSearch->search();
- $total = 0;
- foreach( $mSearchRes as $res ) {
- $total += $res->getTotalHits();
- }
- $this->log( "Setting max_docs to $total\n" );
- $this->scoreMethod->setMaxDocs( $total );
-
- foreach( $sourceIndexTypes as $sourceIndexType ) {
+ $query = $this->builder->buildSourceQuery();
+ foreach( $this->builder->getSourceIndexTypes() as
$sourceIndexType ) {
$sourceIndex = $this->getConnection()->getIndex(
$this->indexBaseName, $sourceIndexType );
$search = new \Elastica\Search( $this->getClient() );
$search->setQuery( $query );
@@ -706,7 +676,8 @@
$path,
Request::PUT,
$args,
- [ 'master_timeout' => $this->masterTimeout ]
+ // master_timeout does not seem to be accepted here.
+ [ /* 'master_timeout' => $this->masterTimeout */ ]
);
// The previous call seems to be async, let's wait few sec
@@ -785,6 +756,38 @@
protected function getIndexTypeName() {
return $this->getConnection()->getIndexName(
$this->indexBaseName, $this->indexTypeName );
}
+
+ /**
+ * Set refresh_interval to -1 on the index returned by getIndex().
+ */
+ private function disableRefreshInterval() {
+ $this->log( "Disabling refresh interval\n" );
+ $settings = [ 'index' => [ 'refresh_interval' => '-1' ] ];
+ $queryParams = [ 'master_timeout' => $this->masterTimeout ];
+ $this->getIndex()->request( '_settings', Request::PUT, $settings, $queryParams );
+ }
+
+ /**
+ * Apply $this->refreshInterval to the index returned by getIndex(),
+ * unless the index settings already report that value.
+ */
+ private function enableRefreshInterval() {
+ $current = $this->getIndex()->getSettings()->getRefreshInterval();
+ // Loose comparison kept as is: both sides are compared with ==.
+ if ( $current == $this->refreshInterval ) {
+ return;
+ }
+ $this->log( "Setting refresh_interval to " . $this->refreshInterval . "\n");
+ $settings = [ 'index' => [ 'refresh_interval' => $this->refreshInterval ] ];
+ $queryParams = [ 'master_timeout' => $this->masterTimeout ];
+ $this->getIndex()->request( '_settings', Request::PUT, $settings, $queryParams );
+ }
+
+ /**
+ * Ask the data sender to freeze the title suggest index type.
+ */
+ private function freezeWrites() {
+ $this->log( "Freezing writes during update\n" );
+ $types = [ Connection::TITLE_SUGGEST_TYPE ];
+ $this->dataSender->freezeIndexes( $types );
+ }
+
+ /**
+ * Ask the data sender to thaw the title suggest index type.
+ */
+ private function resumeWrites() {
+ $this->log( "Resuming writes\n" );
+ $types = [ Connection::TITLE_SUGGEST_TYPE ];
+ $this->dataSender->thawIndexes( $types );
+ }
}
$maintClass = UpdateSuggesterIndex::class;
diff --git a/tests/browser/features/step_definitions/search_steps.rb
b/tests/browser/features/step_definitions/search_steps.rb
index 9e16a59..fc7fedb 100644
--- a/tests/browser/features/step_definitions/search_steps.rb
+++ b/tests/browser/features/step_definitions/search_steps.rb
@@ -471,6 +471,20 @@
step("#{title} is the first search result")
end
end
+# Inside repeat_within(seconds): request api suggestions for the term (with
+# the profile when one was captured) and check the first api suggestion.
+Then(/^within (\d+) seconds getting api suggestions for (.*?)(?: using the (.*) profile)? yields (.*) as the first result$/) do |seconds, term, profile, title|
+ repeat_within(seconds) do
+ if profile
+ step("I get api suggestions for #{term} using the #{profile} profile")
+ else
+ step("I get api suggestions for #{term}")
+ end
+ step("#{title} is the first api suggestion")
+ end
+end
+# Inside repeat_within(seconds): request api suggestions for the term (with
+# the profile when one was captured) and check that the API list is empty.
+Then(/^within (\d+) seconds getting api suggestions for (.*?)(?: using the (.*) profile)? yields no results$/) do |seconds, term, profile|
+ repeat_within(seconds) do
+ if profile
+ step("I get api suggestions for #{term} using the #{profile} profile")
+ else
+ step("I get api suggestions for #{term}")
+ end
+ step("the API should produce empty list")
+ end
+end
Then(/^within (\d+) seconds api searching for (.*) yields (.*?) as the first
result(?: and (.*?) as the second result)?$/) do |seconds, term, title, title2|
repeat_within(seconds) do
step("I api search for " + term)
diff --git a/tests/browser/features/suggest_api.feature
b/tests/browser/features/suggest_api.feature
index c837ac2..7b2df0a 100644
--- a/tests/browser/features/suggest_api.feature
+++ b/tests/browser/features/suggest_api.feature
@@ -93,3 +93,31 @@
Scenario: Default sort can be used as search input
When I ask suggestion API for Wilson
Then the API should produce list starting with Sam Wilson
+
+ Scenario: Realtime updates
+ Given a page named CompSuggest Realtime Updates exists with contents
compsuggest
+ And within 20 seconds getting api suggestions for CompSuggest yields
CompSuggest Realtime Updates as the first result
+ And I delete CompSuggest Realtime Updates
+ And within 20 seconds getting api suggestions for CompSuggest yields no
results
+ And a page named CompSuggest Realtime Updates exists with contents
compsuggest
+ And a page named CompSuggest Realtime Update exists with contents
#REDIRECT [[CompSuggest Realtime Updates]]
+ And a page named RedirCompSuggest exists with contents #REDIRECT
[[CompSuggest Realtime Updates]]
+ And within 20 seconds getting api suggestions for RedirComp yields
RedirCompSuggest as the first result
+ And I delete RedirCompSuggest
+ And within 20 seconds getting api suggestions for RedirComp yields no
results
+ And I delete CompSuggest Realtime Updates
+ And within 20 seconds getting api suggestions for CompSuggest yields no
results
+ And a page named Help:MyHelp exists with contents helpage
+ And a page named MyHelpCompSuggest exists with contents #REDIRECT
[[Help:MyHelp]]
+ And within 20 seconds getting api suggestions for MyHelp yields
MyHelpCompSuggest as the first result
+ And I delete MyHelpCompSuggest
+ And within 20 seconds getting api suggestions for MyHelp yields no results
+ And a page named MyHelpCompSuggest exists with contents #REDIRECT
[[Help:MyHelp]]
+ And within 20 seconds getting api suggestions for MyHelp yields
MyHelpCompSuggest as the first result
+ And I delete Help:MyHelp
+ And within 20 seconds getting api suggestions for MyHelp yields no results
+
+
+
+
+
diff --git a/tests/browser/features/support/hooks.rb
b/tests/browser/features/support/hooks.rb
index 79820fb..f3d3673 100644
--- a/tests/browser/features/support/hooks.rb
+++ b/tests/browser/features/support/hooks.rb
@@ -676,6 +676,11 @@
And a page named The Doors exists with contents The Doors were an
American rock band formed in 1965 in Los Angeles.
And a page named Hyperion Cantos/Endymion exists with contents
Endymion is the third science fiction novel by Dan Simmons.
And a page named はーい exists with contents makes sure we do not fail to
index empty tokens (T156234).
+ And I delete CompSuggest Realtime Updates
+ And I delete CompSuggest Realtime Update
+ And I delete RedirCompSuggest
+ And I delete Help:MyHelp
+ And I delete MyHelpCompSuggest
And I wait 5 seconds
And I reindex suggestions
)
diff --git a/tests/jenkins/FullyFeaturedConfig.php
b/tests/jenkins/FullyFeaturedConfig.php
index 57104ed..f67c3cf 100644
--- a/tests/jenkins/FullyFeaturedConfig.php
+++ b/tests/jenkins/FullyFeaturedConfig.php
@@ -52,6 +52,11 @@
'use' => true,
);
+$wgCirrusSearchCompletionSuggesterRTOptions = [
+ 'realtime_updates' => true,
+ 'refresh_interval' => '1s',
+];
+
// Set defaults to BM25 and the new query builder
$wgCirrusSearchSimilarityProfile = 'bm25_browser_tests';
$wgCirrusSearchFullTextQueryBuilderProfile = 'browser_tests';
diff --git a/tests/unit/SuggestBuilderTest.php
b/tests/unit/SuggestBuilderTest.php
index 0cfdcbd..c1e548a 100644
--- a/tests/unit/SuggestBuilderTest.php
+++ b/tests/unit/SuggestBuilderTest.php
@@ -577,6 +577,222 @@
];
}
+ /**
+ * Build suggestions from $newDoc and $oldDoc, diff them and compare the
+ * ids found in 'to_delete' and 'to_update' with the expected output.
+ *
+ * @dataProvider provideDiffs
+ */
+ public function testDiff( $oldDoc, $newDoc, $output ) {
+ $scoring = SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks' );
+ $builder = new SuggestBuilder( $scoring, [] );
+ // Only the ids of the docs returned by diff() are compared.
+ $idsOf = function ( $docs ) {
+ return array_map(
+ function ( $doc ) {
+ return $doc->getId();
+ },
+ $docs
+ );
+ };
+
+ $newSuggestions = $builder->build( [ $newDoc ] );
+ $oldSuggestions = $builder->build( [ $oldDoc ] );
+ $diff = $builder->diff( $newSuggestions, $oldSuggestions );
+
+ $this->assertEquals(
+ $output['to_delete'],
+ $idsOf( $diff['to_delete'] ),
+ 'to_delete diff result is correct'
+ );
+ $this->assertEquals(
+ $output['to_update'],
+ $idsOf( $diff['to_update'] ),
+ 'to_update diff result is correct'
+ );
+ }
+
+ public function provideDiffs() {
+ return [
+ 'simple no diff' => [
+ [
+ 'id' => 123,
+ 'source' => [
+ 'title' => 'Albert Einstein',
+ 'namespace' => 0,
+ 'redirect' => [
+ [ 'title' =>
"Einstein", 'namespace' => 0 ],
+ [ 'title' => "Bert
Einstein", 'namespace' => 0 ],
+ ],
+ 'incoming_links' => 1
+ ]
+ ],
+ [
+ 'id' => 123,
+ 'source' => [
+ 'title' => 'Albert Einstein',
+ 'namespace' => 0,
+ 'redirect' => [
+ [ 'title' => "Bert
Einstein", 'namespace' => 0 ],
+ [ 'title' =>
"Einstein", 'namespace' => 0 ],
+ ],
+ 'incoming_links' => 123
+ ]
+ ],
+ [
+ 'to_delete' => [],
+ 'to_update' => [],
+ ]
+ ],
+ 'simple new redirect' => [
+ [
+ 'id' => 123,
+ 'source' => [
+ 'title' => 'Albert Einstein',
+ 'namespace' => 0,
+ 'redirect' => [
+ [ 'title' =>
"Einstein", 'namespace' => 0 ],
+ ],
+ 'incoming_links' => 1
+ ],
+ ],
+ [
+ 'id' => 123,
+ 'source' => [
+ 'title' => 'Albert Einstein',
+ 'namespace' => 0,
+ 'redirect' => [
+ [ 'title' =>
"Einstein", 'namespace' => 0 ],
+ [ 'title' =>
"Einstein2", 'namespace' => 0 ],
+ ],
+ 'incoming_links' => 123
+ ]
+ ],
+ [
+ 'to_delete' => [],
+ 'to_update' => ['123r'],
+ ]
+ ],
+ 'all redirect removed' => [
+ [
+ 'id' => 123,
+ 'source' => [
+ 'title' => 'Albert Einstein',
+ 'namespace' => 0,
+ 'redirect' => [
+ [ 'title' =>
"Einstein", 'namespace' => 0 ],
+ ],
+ 'incoming_links' => 1
+ ]
+ ],
+ [
+ 'id' => 123,
+ 'source' => [
+ 'id' => 123,
+ 'title' => 'Albert Einstein',
+ 'namespace' => 0,
+ 'redirect' => [
+ ],
+ 'incoming_links' => 123
+ ]
+ ],
+ [
+ 'to_delete' => ['123r'],
+ 'to_update' => [],
+ ]
+ ],
+ 'redirects removed + new close redirect' => [
+ [
+ 'id' => 123,
+ 'source' => [
+ 'id' => 123,
+ 'title' => 'Albert Einstein',
+ 'namespace' => 0,
+ 'redirect' => [
+ [ 'title' =>
"Einstein", 'namespace' => 0 ],
+ ],
+ 'incoming_links' => 1
+ ]
+ ],
+ [
+ 'id' => 123,
+ 'source' => [
+ 'id' => 123,
+ 'title' => 'Albert Einstein',
+ 'namespace' => 0,
+ 'redirect' => [
+ [ 'title' => "Albert
Enstein", 'namespace' => 0 ],
+ ],
+ 'incoming_links' => 123
+ ],
+ ],
+ [
+ 'to_delete' => ['123r'],
+ 'to_update' => ['123t'],
+ ]
+ ],
+ 'CrossNS redirect added' => [
+ [
+ 'id' => 123,
+ 'source' => [
+ 'id' => 123,
+ 'title' => 'Albert Einstein',
+ 'namespace' => 1,
+ 'redirect' => [
+ [ 'title' =>
"Einstein", 'namespace' => 1 ],
+ ],
+ 'incoming_links' => 1
+ ]
+ ],
+ [
+ 'id' => 123,
+ 'source' => [
+ 'id' => 123,
+ 'title' => 'Albert Einstein',
+ 'namespace' => 1,
+ 'redirect' => [
+ [ 'title' =>
"Einstein", 'namespace' => 1 ],
+ [ 'title' => "Bert",
'namespace' => 0 ],
+ ],
+ 'incoming_links' => 1
+ ]
+ ],
+ [
+ 'to_delete' => [],
+ 'to_update' => ['0t'],
+ ]
+ ],
+ 'CrossNS redirect removed' => [
+ [
+ 'id' => 123,
+ 'source' => [
+ 'id' => 123,
+ 'title' => 'Albert Einstein',
+ 'namespace' => 1,
+ 'redirect' => [
+ [ 'title' =>
"Einstein", 'namespace' => 1 ],
+ [ 'title' => "Bert",
'namespace' => 0 ],
+ ],
+ 'incoming_links' => 1
+ ]
+ ],
+ [
+ 'id' => 123,
+ 'source' => [
+ 'id' => 123,
+ 'title' => 'Albert Einstein',
+ 'namespace' => 1,
+ 'redirect' => [
+ [ 'title' =>
"Einstein", 'namespace' => 1 ],
+ ],
+ 'incoming_links' => 1
+ ]
+ ],
+ [
+ 'to_delete' => ['0t'],
+ 'to_update' => [],
+ ]
+ ]
+ ];
+ }
+
private function buildBuilder( $scoringMethod ) {
$extra = [
new DefaultSortSuggestionsBuilder(),
--
To view, visit https://gerrit.wikimedia.org/r/342820
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I2ceaa065b51d79690f03f123fd1771e500ffb93f
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits