Manybubbles has uploaded a new change for review.
https://gerrit.wikimedia.org/r/113371
Change subject: Make reindex process less brittle
......................................................................
Make reindex process less brittle
1. Catch bulk index failures and retry them as single indexes. This should
help with really really large pages.
2. Catch failures in those single indexes, back off for some time, and
retry.
3. Make the batch size and the number of times to back off and retry
after an error configurable.
Change-Id: I77c82dc8dcaf180f4d701d4ea277c1c45262592d
---
M maintenance/updateOneSearchIndexConfig.php
1 file changed, 59 insertions(+), 5 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/71/113371/1
diff --git a/maintenance/updateOneSearchIndexConfig.php
b/maintenance/updateOneSearchIndexConfig.php
index 962a07f..0d34258 100644
--- a/maintenance/updateOneSearchIndexConfig.php
+++ b/maintenance/updateOneSearchIndexConfig.php
@@ -3,6 +3,7 @@
namespace CirrusSearch;
use Elastica;
use \Maintenance;
+use \ProfileSection;
/**
* Update the search configuration on the search backend.
@@ -43,7 +44,8 @@
// Is the index currently closed?
private $closed = false;
- private $reindexChunkSize = 1000;
+ private $reindexChunkSize;
+ private $reindexRetryAttempts;
private $indexBaseName;
private $indexIdentifier;
@@ -129,6 +131,14 @@
$maintenance->addOption( 'reindexAcceptableCountDeviation',
'How much can the reindexed ' .
'copy of an index is allowed to deviate from the
current copy without triggering a ' .
'reindex failure. Defaults to 5%.', false, true );
+ $maintenance->addOption( 'reindexChunkSize', 'Documents per
shard to reindex in a batch. ' .
+ 'Note when changing the number of shards that the old shard
size is used, not the new ' .
+ 'one. If you see many errors submitting documents in bulk
but the automatic retry as ' .
+ 'singles works then lower this number. Defaults to 100.',
false, true );
+ $maintenance->addOption( 'reindexRetryAttempts', 'Number of
times to back off and retry ' .
+ 'per failure. Note that failures are not common but if
Elasticsearch is in the process ' .
+ 'of moving a shard this can time out. This will retry
the attempt after some backoff ' .
+ 'rather than failing the whole reindex process.
Defaults to 5.', false, true );
$maintenance->addOption( 'baseName', 'What basename to use for
all indexes, ' .
'defaults to wiki id', false, true );
}
@@ -152,6 +162,8 @@
$this->reindexProcesses = $this->getOption( 'reindexProcesses',
wfIsWindows() ? 1 : 10 );
$this->reindexAcceptableCountDeviation =
$this->parsePotentialPercent(
$this->getOption( 'reindexAcceptableCountDeviation',
'5%' ) );
+ $this->reindexChunkSize = $this->getOption( 'reindexChunkSize',
100 );
+ $this->reindexRetryAttempts = $this->getOption(
'reindexRetryAttempts', 5 );
$this->langCode = $wgLanguageCode;
$this->aggressiveSplitting =
$wgCirrusSearchUseAggressiveSplitting;
$this->prefixSearchStartsWithAny =
$wgCirrusSearchPrefixSearchStartsWithAnyWord;
@@ -630,10 +642,7 @@
$result->next();
}
wfProfileOut( __METHOD__ . '::packageDocs' );
- wfProfileIn( __METHOD__ . '::sendDocs' );
- $updateResult =
$this->getPageType()->addDocuments( $documents );
- wfDebugLog( 'CirrusSearch', 'Update completed
in ' . $updateResult->getEngineTime() . ' (engine) millis' );
- wfProfileOut( __METHOD__ . '::sendDocs' );
+ $this->sendDocumentsWithRetry( $messagePrefix,
$documents );
$completed += $result->count();
$rate = round( $completed / ( microtime( true )
- $operationStartTime ) );
$this->output( $this->indent . $messagePrefix .
@@ -646,6 +655,51 @@
}
}
+ private function sendDocumentsWithRetry( $messagePrefix, $documents ) {
+ $profiler = new ProfileSection( __METHOD__ );
+
+ $errors = 0;
+ while ( true ) {
+ if ( $errors < $this->reindexRetryAttempts ) {
+ try {
+ $this->sendDocuments( $messagePrefix,
$documents );
+ return;
+ } catch (
\Elastica\Exception\ExceptionInterface $e ) {
+ $errors += 1;
+ // Random backoff; the upper bound starts at 16 seconds and doubles with each error.
+ // With the default maximum number of errors (5) this maxes out at 256 seconds.
+ $seconds = rand( 1, pow( 2, 3 + $errors
) );
+ $this->output( $this->indent .
$messagePrefix . "Caught an error retrying as singles. " .
+ "Backing off for $seconds and
retrying.\n" );
+ sleep( $seconds );
+ }
+ } else {
+ $this->sendDocuments( $messagePrefix,
$documents );
+ return;
+ }
+ }
+ }
+
+ private function sendDocuments( $messagePrefix, $documents ) {
+ try {
+ $updateResult = $this->getPageType()->addDocuments(
$documents );
+ // if ( rand( 0, 9 ) < 3 ) {
+ // throw new
\Elastica\Exception\InvalidException();
+ // }
+ wfDebugLog( 'CirrusSearch', 'Update completed in ' .
$updateResult->getEngineTime() . ' (engine) millis' );
+ } catch ( \Elastica\Exception\ExceptionInterface $e ) {
+ $this->output( $this->indent . $messagePrefix . "Error
adding documents in bulk. Retrying as singles.\n" );
+ foreach ( $documents as $document ) {
+ // Continue using the bulk api because we're
used to it.
+ $updateResult =
$this->getPageType()->addDocuments( array( $document ) );
+ // if ( rand( 0, 9 ) < 3 ) {
+ // throw new
\Elastica\Exception\InvalidException();
+ // }
+ wfDebugLog( 'CirrusSearch', 'Update completed
in ' . $updateResult->getEngineTime() . ' (engine) millis' );
+ }
+ }
+ }
+
private function createIndex( $rebuild ) {
$analysisConfig = new AnalysisConfigBuilder( $this->langCode,
$this->aggressiveSplitting );
$this->getIndex()->create( array(
--
To view, visit https://gerrit.wikimedia.org/r/113371
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I77c82dc8dcaf180f4d701d4ea277c1c45262592d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits