jenkins-bot has submitted this change and it was merged.

Change subject: Optimize saneitize.php
......................................................................


Optimize saneitize.php

Fetching multiple pages at the same time should speedup the whole process.

Bug: T137113
Change-Id: I95ca761b855e361581e7b3704f90c14257338c53
---
M includes/Connection.php
M includes/Sanity/Checker.php
M includes/Searcher.php
M maintenance/saneitize.php
4 files changed, 212 insertions(+), 58 deletions(-)

Approvals:
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  EBernhardson: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/includes/Connection.php b/includes/Connection.php
index d8adbc9..461eb86 100644
--- a/includes/Connection.php
+++ b/includes/Connection.php
@@ -302,6 +302,23 @@
                }
        }
 
+       /**
+        * @param int[]|null $namespaces List of namespaces to check
+        * @return string[] the list of all index suffixes mathing the 
namespaces
+        */
+       public function getAllIndexSuffixesForNamespaces( $namespaces = null ) {
+               if ( $namespaces ) {
+                       foreach ( $namespaces as $namespace ) {
+                               $indexTypes[] = 
$this->getIndexSuffixForNamespace( $namespace );
+                       }
+                       return array_unique( $indexTypes );
+               }
+               // If no namespaces provided all indices are needed
+               $mappings = $this->config->get( 'CirrusSearchNamespaceMappings' 
);
+               return array_merge( array( self::CONTENT_INDEX_TYPE, 
self::GENERAL_INDEX_TYPE ),
+                       array_values( $mappings ) );
+       }
+
        public function destroyClient() {
                self::$pool = array();
                parent::destroyClient();
diff --git a/includes/Sanity/Checker.php b/includes/Sanity/Checker.php
index f31efef..ab99cb1 100644
--- a/includes/Sanity/Checker.php
+++ b/includes/Sanity/Checker.php
@@ -4,6 +4,7 @@
 
 use CirrusSearch\Connection;
 use CirrusSearch\Searcher;
+use MediaWiki\MediaWikiServices;
 use Status;
 use Title;
 use WikiPage;
@@ -66,60 +67,160 @@
        /**
         * Check if a title is insane.
         *
-        * @param int $pageId page to check
-        * @return Status status of the operation
+        * @param int[] $pageIds page to check
+        * @return int the number of pages updated
         */
-       public function check( $pageId ) {
-               $status = $this->searcher->get( array( $pageId ), array( 
'namespace', 'title' ) );
-               if ( !$status->isOK() ) {
-                       return $status;
-               }
-               $fromIndex = $status->getValue();
-
-               $inIndex = count( $fromIndex ) > 0;
-               $page = WikiPage::newFromID( $pageId );
-
-               if ( $page !== null && $page->exists() ) {
-                       if ( $page->isRedirect() ) {
-                               if ( $inIndex ) {
-                                       $this->remediator->redirectInIndex( 
$page );
-                               } else {
-                                       $this->sane( $pageId, 'Redirect not in 
index' );
-                               }
-                       } else {
-                               if ( $inIndex ) {
-                                       $foundInsanityInIndex = false;
-                                       $expectedType = 
$this->connection->getIndexSuffixForNamespace( 
$page->getTitle()->getNamespace() );
-                                       /** @var IndexInfo $indexInfo */
-                                       foreach ( $fromIndex as $indexInfo ) {
-                                               $matches = array();
-                                               if ( !preg_match( 
'/_(.+)_.+$/', $indexInfo->getIndex(), $matches ) ) {
-                                                       return 
Status::newFatal( "Can't parse index name:  " . $indexInfo->getIndex() );
-                                               }
-                                               $type = $matches[ 1 ];
-                                               if ( $type !== $expectedType ) {
-                                                       // Got to grab the 
index type from the index name....
-                                                       
$this->remediator->pageInWrongIndex( $page, $type );
-                                                       $foundInsanityInIndex = 
true;
-                                               }
-                                       }
-                                       if ( !$foundInsanityInIndex ) {
-                                               $this->sane( $pageId, 'Page in 
index' );
-                                       }
-                               } else {
-                                       $this->remediator->pageNotInIndex( 
$page );
-                               }
+       public function check( array $pageIds ) {
+               $pagesFromDb = $this->loadPagesFromDB( $pageIds );
+               $pagesFromIndex = $this->loadPagesFromIndex( $pageIds );
+               $nbPagesFixed = 0;
+               foreach( $pageIds as $pageId ) {
+                       $fromIndex = array();
+                       if ( isset( $pagesFromIndex[$pageId] ) ) {
+                               $fromIndex = $pagesFromIndex[$pageId];
                        }
-               } else {
+
+                       $updated = false;
+                       if ( isset ( $pagesFromDb[$pageId] ) ) {
+                               $page = $pagesFromDb[$pageId];
+                               $updated = $this->checkExisitingPage( $pageId, 
$page, $fromIndex );
+                       } else {
+                               $updated = $this->checkInexistentPage( $pageId, 
$fromIndex );
+                       }
+                       if( $updated ) {
+                               $nbPagesFixed++;
+                       }
+               }
+               $clusterName = $this->connection->getClusterName();
+               $stats = 
MediaWikiServices::getInstance()->getStatsdDataFactory();
+               $stats->updateCount( 
"CirrusSearch.$clusterName.sanitization.fixed", $nbPagesFixed );
+               $stats->updateCount( 
"CirrusSearch.$clusterName.sanitization.checked", count( $pageIds ) );
+               return $nbPagesFixed;
+       }
+
+       /**
+        * Check that an existing page is properly indexed:
+        * - index it if missing in the index
+        * - delete it if it's a redirect
+        * - verify it if found in the index
+        *
+        * @param int $pageId
+        * @param WikiPage $page
+        * @param \Elastica\Result[] $fromIndex
+        * @return bool true if a modification was needed
+        */
+       private function checkExisitingPage( $pageId, $page, $fromIndex ) {
+               $inIndex = count( $fromIndex ) > 0;
+               if ( $page->isRedirect() ) {
                        if ( $inIndex ) {
-                               $r = $fromIndex[ 0 ];
+                               $this->remediator->redirectInIndex( $page );
+                               return true;
+                       }
+                       $this->sane( $pageId, 'Redirect not in index' );
+                       return false;
+               }
+               if ( $inIndex ) {
+                       return $this->checkIndexMismatch( $pageId, $page, 
$fromIndex );
+               }
+               $this->remediator->pageNotInIndex( $page );
+               return true;
+       }
+
+       /**
+        * Check that an inexistent page is not present in the index
+        * and delete it if found
+        *
+        * @param int $pageId
+        * @param WikiPage $page
+        * @param \Elastica\Result[] $fromIndex
+        * @return bool true if a modification was needed
+        */
+       private function checkInexistentPage( $pageId, $fromIndex ) {
+               $inIndex = count( $fromIndex ) > 0;
+               if ( $inIndex ) {
+                       foreach( $fromIndex as $r ) {
                                $title = Title::makeTitle( $r->namespace, 
$r->title );
                                $this->remediator->ghostPageInIndex( $pageId, 
$title );
-                       } else {
-                               $this->sane( $pageId, 'No ghost' );
+                       }
+                       return true;
+               }
+               $this->sane( $pageId, 'No ghost' );
+               return false;
+       }
+
+
+       /**
+        * Check that a page present in the db and in the index
+        * is properly indexed to the appropriate index by checking its
+        * namespace.
+        *
+        * @param int $pageId
+        * @param WikiPage $page
+        * @param \Elastica\Result[] $fromIndex
+        * @return bool true if a modification was needed
+        */
+       private function checkIndexMismatch( $pageId, $page, $fromIndex ) {
+               $foundInsanityInIndex = false;
+               $expectedType = $this->connection->getIndexSuffixForNamespace( 
$page->getTitle()->getNamespace() );
+               foreach ( $fromIndex as $indexInfo ) {
+                       $matches = array();
+                       if ( !preg_match( '/_(.+)_.+$/', 
$indexInfo->getIndex(), $matches ) ) {
+                               throw new \Exception( "Can't parse index name:  
" . $indexInfo->getIndex() );
+                       }
+                       $type = $matches[ 1 ];
+                       if ( $type !== $expectedType ) {
+                               // Got to grab the index type from the index 
name....
+                               $this->remediator->pageInWrongIndex( $page, 
$type );
+                               $foundInsanityInIndex = true;
                        }
                }
-               return Status::newGood();
+               if ( $foundInsanityInIndex ) {
+                       return true;
+               }
+               $this->sane( $pageId, 'Page in index' );
+               return false;
+       }
+
+
+       /**
+        * @param int[] $ids page ids
+        * @return WikiPage[] the list of wiki pages indexed in page id
+        */
+       private function loadPagesFromDB( array $ids ) {
+               $pages = array();
+               $dbr = wfGetDB( DB_SLAVE );
+               $where = 'page_id IN (' . $dbr->makeList( $ids ) . ')';
+               $res = $dbr->select(
+                       array( 'page' ),
+                       WikiPage::selectFields(),
+                       $where,
+                       __METHOD__
+               );
+               foreach ( $res as $row ) {
+                       $page = WikiPage::newFromRow( $row, $dbr );
+                       $pages[$page->getId()] = $page;
+               }
+               return $pages;
+       }
+
+       /**
+        * @param int[] $ids page ids
+        * @return \Elastica\Result[][] search results indexed by page id
+        * @throws \Exception if an error occurred
+        */
+       private function loadPagesFromIndex( array $ids ) {
+               $status = $this->searcher->get( $ids, array( 'namespace', 
'title' ) );
+               if ( !$status->isOK() ) {
+                       throw new \Exception( 'Cannot fetch ids from index' );
+               }
+               /** @var \Elastica\ResultSet $indexInfo */
+               $dataFromIndex = $status->getValue();
+
+               $indexedPages = array();
+               foreach ( $dataFromIndex as $indexInfo ) {
+                       $indexedPages[$indexInfo->getId()][] = $indexInfo;
+               }
+               return $indexedPages;
        }
 
        private function sane( $pageId, $reason ) {
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 3e06ad2..52926d4 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -901,10 +901,17 @@
                $indexType = $this->connection->pickIndexTypeForNamespaces(
                        $this->getNamespaces()
                );
+
+               // The worst case would be to have all ids duplicated in all 
available indices.
+               // We set the limit accordingly
+               $size = count ( 
$this->connection->getAllIndexSuffixesForNamespaces(
+                       $this->getNamespaces()
+               ));
+               $size *= count( $pageIds );
                return Util::doPoolCounterWork(
                        'CirrusSearch-Search',
                        $this->user,
-                       function() use ( $pageIds, $sourceFiltering, $indexType 
) {
+                       function() use ( $pageIds, $sourceFiltering, 
$indexType, $size ) {
                                try {
                                        $this->start( "get of 
{indexType}.{pageIds}", array(
                                                'indexType' => $indexType,
@@ -913,10 +920,18 @@
                                        ) );
                                        // Shard timeout not supported on get 
requests so we just use the client side timeout
                                        $this->connection->setTimeout( 
$this->config->getElement( 'CirrusSearchClientSideSearchTimeout', 'default' ) );
+                                       // We use a search query instead of 
_get/_mget, these methods are
+                                       // theorically well suited for this 
kind of job but they are not
+                                       // supported on aliases with multiple 
indices (content/general)
                                        $pageType = 
$this->connection->getPageType( $this->indexBaseName, $indexType );
                                        $query = new \Elastica\Query( new 
\Elastica\Query\Ids( null, $pageIds ) );
                                        $query->setParam( '_source', 
$sourceFiltering );
                                        $query->addParam( 'stats', 'get' );
+                                       // We ignore limits provided to the 
searcher
+                                       // otherwize we could return fewer 
results than
+                                       // the ids requested.
+                                       $query->setFrom( 0 );
+                                       $query->setSize( $size );
                                        $resultSet = $pageType->search( $query, 
array( 'search_type' => 'query_then_fetch' ) );
                                        return $this->success( 
$resultSet->getResults() );
                                } catch ( \Elastica\Exception\NotFoundException 
$e ) {
diff --git a/maintenance/saneitize.php b/maintenance/saneitize.php
index e85f74a..e8a20e7 100644
--- a/maintenance/saneitize.php
+++ b/maintenance/saneitize.php
@@ -54,6 +54,7 @@
 
        public function __construct() {
                parent::__construct();
+               $this->setBatchSize( 10 );
                $this->mDescription = "Make the index sane. Always operates on 
a single cluster.";
                $this->addOption( 'fromId', 'Start sanitizing at a specific 
page_id.  Default to 0.', false, true );
                $this->addOption( 'toId', 'Stop sanitizing at a specific 
page_id.  Default to the maximum id in the db + 100.', false, true );
@@ -75,6 +76,15 @@
                $this->getConnection()->setTimeout( 
$wgCirrusSearchMaintenanceTimeout );
                $wgCirrusSearchClientSideUpdateTimeout = 
$wgCirrusSearchMaintenanceTimeout;
 
+               if ( $this->hasOption( 'batch-size' ) ) {
+                       $this->setBatchSize( $this->getOption( 'batch-size' ) );
+                       if ( $this->mBatchSize > 5000 ) {
+                               $this->error( "--batch-size too high!", 1 );
+                       } elseif ( $this->mBatchSize <= 0 ) {
+                               $this->error( "--batch-size must be > 0!", 1 );
+                       }
+
+               }
                $this->setFromAndTo();
                $buildChunks = $this->getOption( 'buildChunks');
                if ( $buildChunks ) {
@@ -83,20 +93,31 @@
                        return;
                }
                $this->buildChecker();
-               $this->check();
+               $updated = $this->check();
+               $this->output( "Fixed $updated page(s) (" . ( $this->toId - 
$this->fromId ) . " checked)" );
        }
 
+       /**
+        * @return int the number of pages corrected
+        */
        private function check() {
-               for ( $pageId = $this->fromId; $pageId <= $this->toId; 
$pageId++ ) {
-                       $status = $this->checker->check( $pageId );
-                       if ( !$status->isOK() ) {
-                               $this->error( $status->getWikiText(), 1 );
-                       }
-                       if ( ( $pageId - $this->fromId ) % 100 === 0 ) {
-                               $this->output( sprintf( "[%20s]%10d/%d\n", 
wfWikiID(), $pageId,
-                                       $this->toId ) );
-                       }
+               $updated = 0;
+               for ( $pageId = $this->fromId; $pageId <= $this->toId; $pageId 
+= $this->mBatchSize ) {
+                       $max = min( $this->toId, $pageId + $this->mBatchSize - 
1 );
+                       $updated += $this->checkChunk( range( $pageId, $max ) );
                }
+               return $updated;
+       }
+
+       /**
+        * @param int[] $ids
+        * @return int number of pages corrected
+        */
+       private function checkChunk( array $ids ) {
+               $updated = $this->checker->check( $ids );
+               $this->output( sprintf( "[%20s]%10d/%d\n", wfWikiID(), end( 
$ids ),
+                       $this->toId ) );
+               return $updated;
        }
 
        private function setFromAndTo() {

-- 
To view, visit https://gerrit.wikimedia.org/r/293503
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I95ca761b855e361581e7b3704f90c14257338c53
Gerrit-PatchSet: 8
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com>
Gerrit-Reviewer: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Gehel <gleder...@wikimedia.org>
Gerrit-Reviewer: Manybubbles <never...@wikimedia.org>
Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to