Manybubbles has uploaded a new change for review.
https://gerrit.wikimedia.org/r/96639
Change subject: Add timeouts to updates and better log failures
......................................................................
Add timeouts to updates and better log failures
$wgCirrusSearchShardTimeout is Elasticsearch's primary shard timeout -
how long it will wait if a shard's primary index is not online. Cirrus
defaults to 1ms assuming that if a shard isn't online we don't have time
to wait for it to get that way. It'll log an error instead. If not set
Elasticsearch defaults to 1 minute.
$wgCirrusSearchClientSideUpdateTimeout is the timeout set on the CURL
requests performing updates to Elasticsearch. It defaults to 5 seconds.
Neither timeout affects maintenance scripts or searches. Maintenance
scripts will wait the full minute. Searches never wait at all and just
return results from the shards that are online.
This also adds a new identifier for wfDebugLog, "CirrusSearchChangeFailed"
to which ids of pages that failed to be deleted or indexed are recorded.
Bug: 57215
Change-Id: I01501673d86c31a3b2c3800c1dc8c263c0b8bebf
---
M CirrusSearch.php
M includes/CirrusSearch.body.php
M includes/CirrusSearchUpdater.php
3 files changed, 101 insertions(+), 23 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/39/96639/1
diff --git a/CirrusSearch.php b/CirrusSearch.php
index 6791a78..0f3b795 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -51,6 +51,19 @@
// set to 1 for some redundancy, if not 2 for more redundancy.
$wgCirrusSearchContentReplicaCount = array( 'content' => 0, 'general' => 0 );
+// Shard timeout for non-maintenance index operations including those done in
the web
+// process and those done via job queue. This is the amount of time
Elasticsearch
+// will wait around for an offline primary shard. Currently this is just used
in
+// page updates and not deletes. If this is specified then page updates
cannot use
+// the bulk api so they will be less efficient. Luckily, this isn't used in
+// maintenance scripts which really need bulk operations. Cirrus defaults to
a very
+// tiny value to prevent folks from waiting around for updates.
+$wgCirrusSearchShardTimeout = '1ms';
+
+// Client side timeout for non-maintenance index and delete operations and
freshness
+// checks in seconds.
+$wgCirrusSearchClientSideUpdateTimeout = 5;
+
// Is it ok if the prefix starts on any word in the title or just the first
word?
// Defaults to false (first word only) because that is the wikipedia behavior
and so
// what we expect users to expect. Does not effect the prefix: search filter
or
diff --git a/includes/CirrusSearch.body.php b/includes/CirrusSearch.body.php
index d5f0611..e72571e 100644
--- a/includes/CirrusSearch.body.php
+++ b/includes/CirrusSearch.body.php
@@ -75,7 +75,9 @@
}
public function delete( $id, $title ) {
- CirrusSearchUpdater::deletePages( array( $id ) );
+ global $wgCirrusSearchClientSideUpdateTimeout;
+
+ CirrusSearchUpdater::deletePages( array( $id ),
$wgCirrusSearchClientSideUpdateTimeout );
}
public function getTextFromContent( Title $t, Content $c = null,
$parserOutput = null ) {
diff --git a/includes/CirrusSearchUpdater.php b/includes/CirrusSearchUpdater.php
index 5863263..08c7ae1 100644
--- a/includes/CirrusSearchUpdater.php
+++ b/includes/CirrusSearchUpdater.php
@@ -36,6 +36,8 @@
* Update a single article using its Title and pre-sanitized text.
*/
public static function updateFromTitleAndText( $id, $title, $text ) {
+ global $wgCirrusSearchShardTimeout,
$wgCirrusSearchClientSideUpdateTimeout;
+
if ( in_array( $id, self::$updated ) ) {
// Already indexed $id
return;
@@ -60,7 +62,7 @@
} else {
self::updatePages( array( array(
'page' => $page,
- ) ) );
+ ) ), false, $wgCirrusSearchShardTimeout,
$wgCirrusSearchClientSideUpdateTimeout );
}
}
@@ -124,10 +126,22 @@
* 'skip-parse' => true, # Don't parse the page, just update link
counts. Optional.
* )
* )
+ * @param boolean $checkFreshness Should we check if Elasticsearch
already has
+ * up to date copy of the document before sending it?
+ * @param null|string $shardTimeout How long should elaticsearch wait
for an offline
+ * shard. Defaults to null, meaning don't wait. Null is more
efficient when sending
+ * multiple pages because Cirrus will use Elasticsearch's bulk API.
Timeout is in
+ * Elasticsearch's time format.
+ * @param null|int $clientSideTimeout timeout in seconds to update
pages or null if using
+ * the Elastica default which is 300 seconds.
*/
- public static function updatePages( $pageData, $checkFreshness = false
) {
+ public static function updatePages( $pageData, $checkFreshness = false,
$shardTimeout = null,
+ $clientSideTimeout = null) {
wfProfileIn( __METHOD__ );
+ if ( $clientSideTimeout !== null ) {
+ CirrusSearchConnection::setTimeout( $clientSideTimeout
);
+ }
$contentDocuments = array();
$generalDocuments = array();
foreach ( $pageData as $page ) {
@@ -144,8 +158,8 @@
$generalDocuments[] = $document;
}
}
- self::sendDocuments(
CirrusSearchConnection::CONTENT_INDEX_TYPE, $contentDocuments );
- self::sendDocuments(
CirrusSearchConnection::GENERAL_INDEX_TYPE, $generalDocuments );
+ self::sendDocuments(
CirrusSearchConnection::CONTENT_INDEX_TYPE, $contentDocuments, $shardTimeout );
+ self::sendDocuments(
CirrusSearchConnection::GENERAL_INDEX_TYPE, $generalDocuments, $shardTimeout );
$count = count( $contentDocuments ) + count( $generalDocuments
);
wfProfileOut( __METHOD__ );
@@ -156,12 +170,16 @@
* Delete pages from the elasticsearch index
*
* @param array $pages An array of ids to delete
+ * @param null|int $clientSideTimeout timeout in seconds to update
pages or null if using
+ * the Elastica default which is 300 seconds.
*/
- public static function deletePages( $pages ) {
+ public static function deletePages( $pages, $clientSideTimeout = null )
{
wfProfileIn( __METHOD__ );
- self::sendDeletes( CirrusSearchConnection::CONTENT_INDEX_TYPE,
$pages );
- self::sendDeletes( CirrusSearchConnection::GENERAL_INDEX_TYPE,
$pages );
+ if ( $clientSideTimeout !== null ) {
+ CirrusSearchConnection::setTimeout( $clientSideTimeout
);
+ }
+ self::sendDeletes( $pages );
wfProfileOut( __METHOD__ );
}
@@ -186,29 +204,62 @@
}
/**
- * @param $indexType
- * @param $documents array
+ * @param string $indexType type of index to which to send $documents
+ * @param array $documents documents to send
+ * @param null|string $shardTimeout How long should elaticsearch wait
for an offline
+ * shard. Defaults to null, meaning don't wait. Null is more
efficient when sending
+ * multiple pages because Cirrus will use Elasticsearch's bulk API.
Timeout is in
+ * Elasticsearch's time format.
*/
- private static function sendDocuments( $indexType, $documents ) {
+ private static function sendDocuments( $indexType, $documents,
$shardTimeout = null ) {
wfProfileIn( __METHOD__ );
$documentCount = count( $documents );
if ( $documentCount === 0 ) {
return;
}
- wfDebugLog( 'CirrusSearch', "Sending $documentCount documents
to the $indexType index." );
$work = new PoolCounterWorkViaCallback( 'CirrusSearch-Update',
"_elasticsearch", array(
- 'doWork' => function() use ( $indexType, $documents ) {
+ 'doWork' => function() use ( $indexType, $documents,
$documentCount, $shardTimeout ) {
try {
- $result =
CirrusSearchConnection::getPageType( $indexType )->addDocuments( $documents );
- wfDebugLog( 'CirrusSearch', 'Update
completed in ' . $result->getEngineTime() . ' (engine) millis' );
+ $pageType =
CirrusSearchConnection::getPageType( $indexType );
+ // The bulk api doesn't support
shardTimeout so don't use it if one is set
+ if ( $shardTimeout === null ) {
+ $start = microtime();
+ wfDebugLog( 'CirrusSearch',
"Sending $documentCount documents to the $indexType index via bulk api." );
+ // addDocuments (notice plural)
is the bulk api
+ $pageType->addDocuments(
$documents );
+ $took = round( ( microtime() -
$start ) * 1000 );
+ wfDebugLog( 'CirrusSearch',
"Update completed in $took millis" );
+ } else {
+ foreach ( $documents as
$document ) {
+ $start = microtime();
+ wfDebugLog(
'CirrusSearch', 'Sending id=' . $document->getId() . " to the $indexType
index." );
+ $document->setTimeout(
$shardTimeout );
+ // The bulk api
automatically performs an update if the opType is update but the index api
+ // doesn't so we have
to make the switch ourselves
+ if (
$document->getOpType() === 'update' ) {
+
$pageType->updateDocument( $document );
+ } else {
+ // addDocument
(notice singular) is the non-bulk index api
+
$pageType->addDocument( $document );
+ }
+ $took = round( (
microtime() - $start ) * 1000 );
+ wfDebugLog(
'CirrusSearch', "Update completed in $took millis" );
+ }
+ }
} catch (
\Elastica\Exception\ExceptionInterface $e ) {
- error_log( "CirrusSearch update failed
caused by: " . $e->getMessage() );
+ wfLogWarning( 'CirrusSearch update
failed caused by: ' . $e->getMessage() );
+ foreach ( $documents as $document ) {
+ wfDebugLog(
'CirrusSearchChangeFailed', 'Update: ' . $document->getId() );
+ }
}
},
'error' => function( $status ) {
$status = $status->getErrorsArray();
wfLogWarning( 'Pool error sending documents to
Elasticsearch: ' . $status[ 0 ][ 0 ] );
+ foreach ( $documents as $document ) {
+ wfDebugLog( 'CirrusSearchChangeFailed',
'Update: ' . $document->getId() );
+ }
return false;
}
) );
@@ -243,6 +294,7 @@
$doc->setDocAsUpsert( true );
$doc->setOpType( 'update' );
} else {
+ $doc->setOpType( 'index' );
$parserOutput = $page->getParserOutput( new
ParserOptions(), $page->getRevision()->getId() );
$text = Sanitizer::stripAllTags( SearchEngine::create(
'CirrusSearch' )
->getTextFromContent( $title,
$page->getContent(), $parserOutput ) );
@@ -364,6 +416,7 @@
*/
public static function updateLinkedArticles( $addedLinks, $removedLinks
) {
global $wgCirrusSearchLinkedArticlesToUpdate,
$wgCirrusSearchUnlinkedArticlesToUpdate;
+ global $wgCirrusSearchShardTimeout,
$wgCirrusSearchClientSideUpdateTimeout;
$titles = array_merge(
self::pickFromArray( $addedLinks,
$wgCirrusSearchLinkedArticlesToUpdate ),
@@ -401,7 +454,7 @@
'skip-parse' => true, // Just update link
counts
);
}
- self::updatePages( $pages );
+ self::updatePages( $pages, false, $wgCirrusSearchShardTimeout,
$wgCirrusSearchClientSideUpdateTimeout );
}
/**
@@ -430,29 +483,39 @@
}
/**
- * @param $indexType
* @param $ids array
*/
- private static function sendDeletes( $indexType, $ids ) {
+ private static function sendDeletes( $ids ) {
wfProfileIn( __METHOD__ );
$idCount = count( $ids );
if ( $idCount === 0 ) {
return;
}
- wfDebugLog( 'CirrusSearch', "Sending $idCount deletes to the
$indexType index." );
+ wfDebugLog( 'CirrusSearch', "Sending $idCount deletes to the
index." );
$work = new PoolCounterWorkViaCallback( 'CirrusSearch-Update',
"_elasticsearch", array(
'doWork' => function() use ( $indexType, $ids ) {
try {
- $result =
CirrusSearchConnection::getPageType( $indexType )->deleteIds( $ids );
- wfDebugLog( 'CirrusSearch', 'Delete
completed in ' . $result->getEngineTime() . ' (engine) millis' );
+ $start = microtime();
+ CirrusSearchConnection::getPageType(
CirrusSearchConnection::CONTENT_INDEX_TYPE )
+ ->deleteIds( $ids );
+ CirrusSearchConnection::getPageType(
CirrusSearchConnection::GENERAL_INDEX_TYPE )
+ ->deleteIds( $ids );
+ $took = round( ( microtime() - $start )
* 1000 );
+ wfDebugLog( 'CirrusSearch', "Delete
completed in $took millis" );
} catch (
\Elastica\Exception\ExceptionInterface $e ) {
- error_log( "CirrusSearch delete failed
caused by: " . $e->getMessage() );
+ wfLogWarning( "CirrusSearch delete
failed caused by: " . $e->getMessage() );
+ foreach ( $ids as $id ) {
+ wfDebugLog(
'CirrusSearchChangeFailed', "Delete: $id" );
+ }
}
},
'error' => function( $status ) {
$status = $status->getErrorsArray();
wfLogWarning( 'Pool error sending deletes to
Elasticsearch: ' . $status[ 0 ][ 0 ] );
+ foreach ( $ids as $id ) {
+ wfDebugLog( 'CirrusSearchChangeFailed',
"Delete: $id" );
+ }
return false;
}
) );
--
To view, visit https://gerrit.wikimedia.org/r/96639
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I01501673d86c31a3b2c3800c1dc8c263c0b8bebf
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits