jenkins-bot has submitted this change and it was merged. Change subject: Optimize saneitize.php ......................................................................
Optimize saneitize.php Fetching multiple pages at the same time should speedup the whole process. Bug: T137113 Change-Id: I95ca761b855e361581e7b3704f90c14257338c53 --- M includes/Connection.php M includes/Sanity/Checker.php M includes/Searcher.php M maintenance/saneitize.php 4 files changed, 212 insertions(+), 58 deletions(-) Approvals: Cindy-the-browser-test-bot: Looks good to me, but someone else must approve EBernhardson: Looks good to me, approved jenkins-bot: Verified diff --git a/includes/Connection.php b/includes/Connection.php index d8adbc9..461eb86 100644 --- a/includes/Connection.php +++ b/includes/Connection.php @@ -302,6 +302,23 @@ } } + /** + * @param int[]|null $namespaces List of namespaces to check + * @return string[] the list of all index suffixes mathing the namespaces + */ + public function getAllIndexSuffixesForNamespaces( $namespaces = null ) { + if ( $namespaces ) { + foreach ( $namespaces as $namespace ) { + $indexTypes[] = $this->getIndexSuffixForNamespace( $namespace ); + } + return array_unique( $indexTypes ); + } + // If no namespaces provided all indices are needed + $mappings = $this->config->get( 'CirrusSearchNamespaceMappings' ); + return array_merge( array( self::CONTENT_INDEX_TYPE, self::GENERAL_INDEX_TYPE ), + array_values( $mappings ) ); + } + public function destroyClient() { self::$pool = array(); parent::destroyClient(); diff --git a/includes/Sanity/Checker.php b/includes/Sanity/Checker.php index f31efef..ab99cb1 100644 --- a/includes/Sanity/Checker.php +++ b/includes/Sanity/Checker.php @@ -4,6 +4,7 @@ use CirrusSearch\Connection; use CirrusSearch\Searcher; +use MediaWiki\MediaWikiServices; use Status; use Title; use WikiPage; @@ -66,60 +67,160 @@ /** * Check if a title is insane. * - * @param int $pageId page to check - * @return Status status of the operation + * @param int[] $pageIds page to check + * @return int the number of pages updated */ - public function check( $pageId ) { - $status = $this->searcher->get( array( $pageId ), array( 'namespace', 'title' ) ); - if ( !$status->isOK() ) { - return $status; - } - $fromIndex = $status->getValue(); - - $inIndex = count( $fromIndex ) > 0; - $page = WikiPage::newFromID( $pageId ); - - if ( $page !== null && $page->exists() ) { - if ( $page->isRedirect() ) { - if ( $inIndex ) { - $this->remediator->redirectInIndex( $page ); - } else { - $this->sane( $pageId, 'Redirect not in index' ); - } - } else { - if ( $inIndex ) { - $foundInsanityInIndex = false; - $expectedType = $this->connection->getIndexSuffixForNamespace( $page->getTitle()->getNamespace() ); - /** @var IndexInfo $indexInfo */ - foreach ( $fromIndex as $indexInfo ) { - $matches = array(); - if ( !preg_match( '/_(.+)_.+$/', $indexInfo->getIndex(), $matches ) ) { - return Status::newFatal( "Can't parse index name: " . $indexInfo->getIndex() ); - } - $type = $matches[ 1 ]; - if ( $type !== $expectedType ) { - // Got to grab the index type from the index name.... - $this->remediator->pageInWrongIndex( $page, $type ); - $foundInsanityInIndex = true; - } - } - if ( !$foundInsanityInIndex ) { - $this->sane( $pageId, 'Page in index' ); - } - } else { - $this->remediator->pageNotInIndex( $page ); - } + public function check( array $pageIds ) { + $pagesFromDb = $this->loadPagesFromDB( $pageIds ); + $pagesFromIndex = $this->loadPagesFromIndex( $pageIds ); + $nbPagesFixed = 0; + foreach( $pageIds as $pageId ) { + $fromIndex = array(); + if ( isset( $pagesFromIndex[$pageId] ) ) { + $fromIndex = $pagesFromIndex[$pageId]; } - } else { + + $updated = false; + if ( isset ( $pagesFromDb[$pageId] ) ) { + $page = $pagesFromDb[$pageId]; + $updated = $this->checkExisitingPage( $pageId, $page, $fromIndex ); + } else { + $updated = $this->checkInexistentPage( $pageId, $fromIndex ); + } + if( $updated ) { + $nbPagesFixed++; + } + } + $clusterName = $this->connection->getClusterName(); + $stats = MediaWikiServices::getInstance()->getStatsdDataFactory(); + $stats->updateCount( "CirrusSearch.$clusterName.sanitization.fixed", $nbPagesFixed ); + $stats->updateCount( "CirrusSearch.$clusterName.sanitization.checked", count( $pageIds ) ); + return $nbPagesFixed; + } + + /** + * Check that an existing page is properly indexed: + * - index it if missing in the index + * - delete it if it's a redirect + * - verify it if found in the index + * + * @param int $pageId + * @param WikiPage $page + * @param \Elastica\Result[] $fromIndex + * @return bool true if a modification was needed + */ + private function checkExisitingPage( $pageId, $page, $fromIndex ) { + $inIndex = count( $fromIndex ) > 0; + if ( $page->isRedirect() ) { if ( $inIndex ) { - $r = $fromIndex[ 0 ]; + $this->remediator->redirectInIndex( $page ); + return true; + } + $this->sane( $pageId, 'Redirect not in index' ); + return false; + } + if ( $inIndex ) { + return $this->checkIndexMismatch( $pageId, $page, $fromIndex ); + } + $this->remediator->pageNotInIndex( $page ); + return true; + } + + /** + * Check that an inexistent page is not present in the index + * and delete it if found + * + * @param int $pageId + * @param WikiPage $page + * @param \Elastica\Result[] $fromIndex + * @return bool true if a modification was needed + */ + private function checkInexistentPage( $pageId, $fromIndex ) { + $inIndex = count( $fromIndex ) > 0; + if ( $inIndex ) { + foreach( $fromIndex as $r ) { $title = Title::makeTitle( $r->namespace, $r->title ); $this->remediator->ghostPageInIndex( $pageId, $title ); - } else { - $this->sane( $pageId, 'No ghost' ); + } + return true; + } + $this->sane( $pageId, 'No ghost' ); + return false; + } + + + /** + * Check that a page present in the db and in the index + * is properly indexed to the appropriate index by checking its + * namespace. + * + * @param int $pageId + * @param WikiPage $page + * @param \Elastica\Result[] $fromIndex + * @return bool true if a modification was needed + */ + private function checkIndexMismatch( $pageId, $page, $fromIndex ) { + $foundInsanityInIndex = false; + $expectedType = $this->connection->getIndexSuffixForNamespace( $page->getTitle()->getNamespace() ); + foreach ( $fromIndex as $indexInfo ) { + $matches = array(); + if ( !preg_match( '/_(.+)_.+$/', $indexInfo->getIndex(), $matches ) ) { + throw new \Exception( "Can't parse index name: " . $indexInfo->getIndex() ); + } + $type = $matches[ 1 ]; + if ( $type !== $expectedType ) { + // Got to grab the index type from the index name.... + $this->remediator->pageInWrongIndex( $page, $type ); + $foundInsanityInIndex = true; } } - return Status::newGood(); + if ( $foundInsanityInIndex ) { + return true; + } + $this->sane( $pageId, 'Page in index' ); + return false; + } + + + /** + * @param int[] $ids page ids + * @return WikiPage[] the list of wiki pages indexed in page id + */ + private function loadPagesFromDB( array $ids ) { + $pages = array(); + $dbr = wfGetDB( DB_SLAVE ); + $where = 'page_id IN (' . $dbr->makeList( $ids ) . ')'; + $res = $dbr->select( + array( 'page' ), + WikiPage::selectFields(), + $where, + __METHOD__ + ); + foreach ( $res as $row ) { + $page = WikiPage::newFromRow( $row, $dbr ); + $pages[$page->getId()] = $page; + } + return $pages; + } + + /** + * @param int[] $ids page ids + * @return \Elastica\Result[][] search results indexed by page id + * @throws \Exception if an error occurred + */ + private function loadPagesFromIndex( array $ids ) { + $status = $this->searcher->get( $ids, array( 'namespace', 'title' ) ); + if ( !$status->isOK() ) { + throw new \Exception( 'Cannot fetch ids from index' ); + } + /** @var \Elastica\ResultSet $indexInfo */ + $dataFromIndex = $status->getValue(); + + $indexedPages = array(); + foreach ( $dataFromIndex as $indexInfo ) { + $indexedPages[$indexInfo->getId()][] = $indexInfo; + } + return $indexedPages; } private function sane( $pageId, $reason ) { diff --git a/includes/Searcher.php b/includes/Searcher.php index 3e06ad2..52926d4 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -901,10 +901,17 @@ $indexType = $this->connection->pickIndexTypeForNamespaces( $this->getNamespaces() ); + + // The worst case would be to have all ids duplicated in all available indices. + // We set the limit accordingly + $size = count ( $this->connection->getAllIndexSuffixesForNamespaces( + $this->getNamespaces() + )); + $size *= count( $pageIds ); return Util::doPoolCounterWork( 'CirrusSearch-Search', $this->user, - function() use ( $pageIds, $sourceFiltering, $indexType ) { + function() use ( $pageIds, $sourceFiltering, $indexType, $size ) { try { $this->start( "get of {indexType}.{pageIds}", array( 'indexType' => $indexType, @@ -913,10 +920,18 @@ ) ); // Shard timeout not supported on get requests so we just use the client side timeout $this->connection->setTimeout( $this->config->getElement( 'CirrusSearchClientSideSearchTimeout', 'default' ) ); + // We use a search query instead of _get/_mget, these methods are + // theorically well suited for this kind of job but they are not + // supported on aliases with multiple indices (content/general) $pageType = $this->connection->getPageType( $this->indexBaseName, $indexType ); $query = new \Elastica\Query( new \Elastica\Query\Ids( null, $pageIds ) ); $query->setParam( '_source', $sourceFiltering ); $query->addParam( 'stats', 'get' ); + // We ignore limits provided to the searcher + // otherwize we could return fewer results than + // the ids requested. + $query->setFrom( 0 ); + $query->setSize( $size ); $resultSet = $pageType->search( $query, array( 'search_type' => 'query_then_fetch' ) ); return $this->success( $resultSet->getResults() ); } catch ( \Elastica\Exception\NotFoundException $e ) { diff --git a/maintenance/saneitize.php b/maintenance/saneitize.php index e85f74a..e8a20e7 100644 --- a/maintenance/saneitize.php +++ b/maintenance/saneitize.php @@ -54,6 +54,7 @@ public function __construct() { parent::__construct(); + $this->setBatchSize( 10 ); $this->mDescription = "Make the index sane. Always operates on a single cluster."; $this->addOption( 'fromId', 'Start sanitizing at a specific page_id. Default to 0.', false, true ); $this->addOption( 'toId', 'Stop sanitizing at a specific page_id. Default to the maximum id in the db + 100.', false, true ); @@ -75,6 +76,15 @@ $this->getConnection()->setTimeout( $wgCirrusSearchMaintenanceTimeout ); $wgCirrusSearchClientSideUpdateTimeout = $wgCirrusSearchMaintenanceTimeout; + if ( $this->hasOption( 'batch-size' ) ) { + $this->setBatchSize( $this->getOption( 'batch-size' ) ); + if ( $this->mBatchSize > 5000 ) { + $this->error( "--batch-size too high!", 1 ); + } elseif ( $this->mBatchSize <= 0 ) { + $this->error( "--batch-size must be > 0!", 1 ); + } + + } $this->setFromAndTo(); $buildChunks = $this->getOption( 'buildChunks'); if ( $buildChunks ) { @@ -83,20 +93,31 @@ return; } $this->buildChecker(); - $this->check(); + $updated = $this->check(); + $this->output( "Fixed $updated page(s) (" . ( $this->toId - $this->fromId ) . " checked)" ); } + /** + * @return int the number of pages corrected + */ private function check() { - for ( $pageId = $this->fromId; $pageId <= $this->toId; $pageId++ ) { - $status = $this->checker->check( $pageId ); - if ( !$status->isOK() ) { - $this->error( $status->getWikiText(), 1 ); - } - if ( ( $pageId - $this->fromId ) % 100 === 0 ) { - $this->output( sprintf( "[%20s]%10d/%d\n", wfWikiID(), $pageId, - $this->toId ) ); - } + $updated = 0; + for ( $pageId = $this->fromId; $pageId <= $this->toId; $pageId += $this->mBatchSize ) { + $max = min( $this->toId, $pageId + $this->mBatchSize - 1 ); + $updated += $this->checkChunk( range( $pageId, $max ) ); } + return $updated; + } + + /** + * @param int[] $ids + * @return int number of pages corrected + */ + private function checkChunk( array $ids ) { + $updated = $this->checker->check( $ids ); + $this->output( sprintf( "[%20s]%10d/%d\n", wfWikiID(), end( $ids ), + $this->toId ) ); + return $updated; } private function setFromAndTo() { -- To view, visit https://gerrit.wikimedia.org/r/293503 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I95ca761b855e361581e7b3704f90c14257338c53 Gerrit-PatchSet: 8 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Gehel <gleder...@wikimedia.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits