jenkins-bot has submitted this change and it was merged.

Change subject: Optimize in place reindexing.
......................................................................


Optimize in place reindexing.

1.  Turn replicas on _after_ building the new index.
2.  Optimize the index before adding the replicas.  Might help some with
speed but helps more with query performance on closed indexes.
3.  Build the index in multiple processes.  Default to doing this on
linux and not on Windows.
4.  Use the 'create' action instead of the 'index' action.

Bug: 54918
Change-Id: I51101191a9e1daac794d027af23bbf6a0fb096ab
---
M CirrusSearch.php
A includes/CirrusSearchReindexForkController.php
M maintenance/updateOneSearchIndexConfig.php
3 files changed, 238 insertions(+), 50 deletions(-)

Approvals:
  Chad: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/CirrusSearch.php b/CirrusSearch.php
index 54764e6..8d0d4f9 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -101,18 +101,19 @@
 // Changing it requires an in place reindex to take effect.  Currently only 
available in English.
 $wgCirrusSearchUseAggressiveSplitting = true;
 
-$dir = __DIR__ . '/';
+$includes = __DIR__ . "/includes/";
 /**
  * Classes
  */
-$wgAutoloadClasses['CirrusSearch'] = $dir . 'includes/CirrusSearch.body.php';
-$wgAutoloadClasses['CirrusSearchAnalysisConfigBuilder'] = $dir . 
'includes/CirrusSearchAnalysisConfigBuilder.php';
-$wgAutoloadClasses['CirrusSearchConnection'] = $dir . 
'includes/CirrusSearchConnection.php';
-$wgAutoloadClasses['CirrusSearchMappingConfigBuilder'] = $dir . 
'includes/CirrusSearchMappingConfigBuilder.php';
-$wgAutoloadClasses['CirrusSearchPrefixSearchHook'] = $dir . 
'includes/CirrusSearchPrefixSearchHook.php';
-$wgAutoloadClasses['CirrusSearchSearcher'] = $dir . 
'includes/CirrusSearchSearcher.php';
-$wgAutoloadClasses['CirrusSearchTextFormatter'] = $dir . 
'includes/CirrusSearchTextFormatter.php';
-$wgAutoloadClasses['CirrusSearchUpdater'] = $dir . 
'includes/CirrusSearchUpdater.php';
+$wgAutoloadClasses['CirrusSearch'] = $includes . 'CirrusSearch.body.php';
+$wgAutoloadClasses['CirrusSearchAnalysisConfigBuilder'] = $includes . 
'CirrusSearchAnalysisConfigBuilder.php';
+$wgAutoloadClasses['CirrusSearchConnection'] = $includes . 
'CirrusSearchConnection.php';
+$wgAutoloadClasses['CirrusSearchMappingConfigBuilder'] = $includes . 
'CirrusSearchMappingConfigBuilder.php';
+$wgAutoloadClasses['CirrusSearchPrefixSearchHook'] = $includes . 
'CirrusSearchPrefixSearchHook.php';
+$wgAutoloadClasses['CirrusSearchReindexForkController'] = $includes . 
'CirrusSearchReindexForkController.php';
+$wgAutoloadClasses['CirrusSearchSearcher'] = $includes . 
'CirrusSearchSearcher.php';
+$wgAutoloadClasses['CirrusSearchTextFormatter'] = $includes . 
'CirrusSearchTextFormatter.php';
+$wgAutoloadClasses['CirrusSearchUpdater'] = $includes . 
'CirrusSearchUpdater.php';
 
 /**
  * Hooks
@@ -123,7 +124,7 @@
 /**
  * i18n
  */
-$wgExtensionMessagesFiles['CirrusSearch'] = $dir . 'CirrusSearch.i18n.php';
+$wgExtensionMessagesFiles['CirrusSearch'] = __DIR__ . '/CirrusSearch.i18n.php';
 
 
 /**
diff --git a/includes/CirrusSearchReindexForkController.php 
b/includes/CirrusSearchReindexForkController.php
new file mode 100644
index 0000000..f6b3803
--- /dev/null
+++ b/includes/CirrusSearchReindexForkController.php
@@ -0,0 +1,61 @@
+<?
+/**
+ * Extensions to ForeController to prepare Elastica and to tell the child
+ * process which one it is.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+class CirrusSearchReindexForkController extends ForkController {
+       /**
+        * @var integer number of this child or null if this is the parent
+        */
+       var $childNumber;
+       /**
+        * Fork a number of worker processes.  Have to hack ForkController to 
store
+        * the child number.
+        *
+        * @return string
+        */
+       protected function forkWorkers( $numProcs ) {
+               $this->prepareEnvironment();
+
+               // Create the child processes
+               for ( $i = 0; $i < $numProcs; $i++ ) {
+                       // Do the fork
+                       $pid = pcntl_fork();
+                       if ( $pid === -1 || $pid === false ) {
+                               echo "Error creating child processes\n";
+                               exit( 1 );
+                       }
+
+                       if ( !$pid ) {
+                               $this->initChild();
+                               $this->childNumber = $i; // Hack right here.
+                               return 'child';
+                       } else {
+                               // This is the parent process
+                               $this->children[$pid] = true;
+                       }
+               }
+
+               return 'parent';
+       }
+
+       protected function prepareEnvironment() {
+               parent::prepareEnvironment();
+               CirrusSearchConnection::destroySingleton();
+       }
+}
diff --git a/maintenance/updateOneSearchIndexConfig.php 
b/maintenance/updateOneSearchIndexConfig.php
index 575cd67..fdd3e41 100644
--- a/maintenance/updateOneSearchIndexConfig.php
+++ b/maintenance/updateOneSearchIndexConfig.php
@@ -44,6 +44,16 @@
        // steps.
        private $removeIndecies = false;
 
+       /**
+        * @var are there too few replicas in the index we're making?
+        */
+       private $tooFewReplicas = false;
+
+       /**
+        * @var number of processes to use when reindexing
+        */
+       private $reindexProcesses;
+
        public function __construct() {
                parent::__construct();
                $this->addDescription( "Update the configuration or contents of 
one search index." );
@@ -78,6 +88,8 @@
                        "reindex all documents from that index (via the alias) 
to this one, swing the " .
                        "alias to this index, and then remove other index.  
You'll have to redo all updates ".
                        "performed during this operation manually.  Defaults to 
false." );
+               $maintenance->addOption( 'reindexProcesses', 'Number of 
processess to use in reindex.  ' .
+                       'Not supported on Windows.  Defaults to 1 on Windows 
and 10 otherwise.', false, true );
        }
 
        public function execute() {
@@ -101,6 +113,7 @@
                        $this->closeOk = $this->getOption( 'closeOk', false );
                        $this->indexIdentifier = 
$this->pickIndexIdentifierFromOption( $this->getOption( 'indexIdentifier', 
'current' ) );
                        $this->reindexAndRemoveOk = $this->getOption( 
'reindexAndRemoveOk', false );
+                       $this->reindexProcesses = $this->getOption( 
'reindexProcesses', wfIsWindows() ? 1 : 10 );
 
                        $this->validateIndex();
                        $this->validateAnalyzers();
@@ -134,10 +147,14 @@
                        return;
                }
                $this->output( $this->indent . "Index exists so 
validating...\n" );
+               $this->validateIndexSettings();
+       }
+
+       private function validateIndexSettings() {
                $settings = $this->getIndex()->getSettings()->get();
 
                $this->output( $this->indent . "\tValidating number of 
shards..." );
-               $actualShardCount = $settings['index.number_of_shards'];
+               $actualShardCount = $settings[ 'index.number_of_shards' ];
                if ( $actualShardCount == $this->getShardCount() ) {
                        $this->output( "ok\n" );
                } else {
@@ -151,7 +168,7 @@
                }
 
                $this->output( $this->indent . "\tValidating number of 
replicas..." );
-               $actualReplicaCount = $settings['index.number_of_replicas'];
+               $actualReplicaCount = $settings[ 'index.number_of_replicas' ];
                if ( $actualReplicaCount == $this->getReplicaCount() ) {
                        $this->output( "ok\n" );
                } else {
@@ -289,8 +306,11 @@
                        }
                } else {
                        foreach ( $status->getIndicesWithAlias( 
$specificAliasName ) as $index ) {
-                               if( $this->getName() === 
$this->getSpecificIndexName() ) {
+                               if( $index->getName() === 
$this->getSpecificIndexName() ) {
                                        $this->output( "ok\n" );
+                                       if ( $this->tooFewReplicas ) {
+                                               $this->validateIndexSettings();
+                                       }
                                        return;
                                } else {
                                        $otherIndeciesWithAlias[] = 
$index->getName();
@@ -301,16 +321,40 @@
                        $this->output( "alias is free..." );
                        $this->getIndex()->addAlias( $specificAliasName, false 
);
                        $this->output( "corrected\n" );
+                       if ( $this->tooFewReplicas ) {
+                               $this->validateIndexSettings();
+                       }
                        return;
                }
                if ( $this->reindexAndRemoveOk ) {
                        $this->output( "is taken...\n" );
-                       $this->output( $this->indent . "\tReindexing...\n");
+                       $this->output( $this->indent . "\tReindexing...\n" );
                        // Muck with $this->indent because reindex is used to 
running at the top level.
                        $saveIndent = $this->indent;
                        $this->indent = $this->indent . "\t\t";
                        $this->reindex();
                        $this->indent = $saveIndent;
+                       if ( $this->tooFewReplicas ) {
+                               // Optimize the index so it'll be more compact 
for replication.  Not required
+                               // but should be helpful.
+                               $this->output( $this->indent . 
"\tOptimizing..." );
+                               $this->getIndex()->optimize( array( 
'max_num_segments' => 5 ) );
+                               $this->output( "Done\n" );
+                               $this->validateIndexSettings();
+                               $this->output( $this->indent . "\tWaiting for 
all shards to start...\n" );
+                               $each = 0;
+                               while ( true ) {
+                                       $unstarted = 
$this->indexShardsUnstarted();
+                                       if ( $each === 0 ) {
+                                               $this->output( $this->indent . 
"\t\t$unstarted remaining\n" );
+                                       }
+                                       if ( $unstarted === 0 ) {
+                                               break;
+                                       }
+                                       $each = ( $each + 1 ) % 20;
+                                       sleep( 1 );
+                               }
+                       }
                        $this->output( $this->indent . "\tSwapping alias...");
                        $this->getIndex()->addAlias( $specificAliasName, true );
                        $this->output( "done\n" );
@@ -324,6 +368,21 @@
                        "--reindexAndRemoveOk.  Make sure you understand the 
consequences of either\n" .
                        "choice." );
                $this->returnCode = 1;
+       }
+
+       private function indexShardsUnstarted() {
+               $data = $this->getIndex()->getStatus()->getData();
+               $count = 0;
+               foreach ( $data[ 'indices' ] as $index ) {
+                       foreach ( $index[ 'shards' ] as $shard ) {
+                               foreach ( $shard as $replica ) {
+                                       if ( $replica[ 'state' ] !== 'STARTED' 
) {
+                                               $count++;
+                                       }
+                               }
+                       }
+               }
+               return $count;
        }
 
        public function validateAllAlias() {
@@ -385,46 +444,111 @@
         * reparsing everything.
         */
        private function reindex() {
-               $query = new Elastica\Query();
-               $query->setFields( array( '_id', '_source' ) );
-
-               // Note here we dump from the current index (using the alias) 
so we can use CirrusSearchConnection::getPageType
-               $result = CirrusSearchConnection::getPageType( $this->indexType 
)->search( $query, array(
-                       'search_type' => 'scan',
-                       'scroll' => '10m',
-                       'size'=> $this->reindexChunkSize / 
$this->getShardCount()
+               $settings = $this->getIndex()->getSettings();
+               $settings->set( array(
+                       'refresh_interval' => -1,           // This is supposed 
to help with bulk index io load.
+                       'merge.policy.merge_factor' => 20,  // This is supposed 
to help with bulk index io load.
                ) );
-               $totalDocsToReindex = $result->getResponse()->getData();
-               $totalDocsToReindex = $totalDocsToReindex['hits']['total'];
-               $this->output( $this->indent . "About to reindex 
$totalDocsToReindex documents\n" );
-               $operationStartTime = microtime( true );
-               $completed = 0;
 
-               while ( true ) {
-                       wfProfileIn( __METHOD__ . '::receiveDocs' );
-                       $result = $this->getIndex()->search( array(), array(
-                               'scroll_id' => 
$result->getResponse()->getScrollId(),
-                               'scroll' => '10m'
-                       ) );
-                       wfProfileOut( __METHOD__ . '::receiveDocs' );
-                       if ( !$result->count() ) {
-                               $this->output( $this->indent . "All done\n" );
+               if ( $this->reindexProcesses > 1 ) {
+                       $fork = new CirrusSearchReindexForkController( 
$this->reindexProcesses );
+                       $forkResult = $fork->start();
+                       switch ( $forkResult ) {
+                       case 'child':
+                               $this->reindexInternal( 
$this->reindexProcesses, $fork->childNumber );
+                               die( 0 );
+                       case 'done':
                                break;
+                       default:
+                               $this->error( "Unexpected result while forking: 
 $forkResult", 1 );
                        }
-                       wfProfileIn( __METHOD__ . '::packageDocs' );
-                       $documents = array();
-                       while ( $result->current() ) {
-                               $documents[] = new \Elastica\Document( 
$result->current()->getId(), $result->current()->getSource() );
-                               $result->next();
+
+                       $this->output( $this->indent . "Verifying counts..." );
+                       $oldCount = CirrusSearchConnection::getPageType( 
$this->indexType )->count();
+                       $this->getIndex()->refresh();
+                       $newCount = $this->getPageType()->count();
+                       if ( $oldCount !== $newCount ) {
+                               $this->output( "Different!  Expected $oldCount 
but got $newCount\n" );
+                               $this->error( "Failed to load index.  Expected 
$oldCount but got $newCount.  Check for warnings above.", 1 );
                        }
-                       wfProfileOut( __METHOD__ . '::packageDocs' );
-                       wfProfileIn( __METHOD__ . '::sendDocs' );
-                       $updateResult = $this->getPageType()->addDocuments( 
$documents );
-                       wfDebugLog( 'CirrusSearch', 'Update completed in ' . 
$updateResult->getEngineTime() . ' (engine) millis' );
-                       wfProfileOut( __METHOD__ . '::sendDocs' );
-                       $completed += $result->count();
-                       $rate = round( $completed / ( microtime( true ) - 
$operationStartTime ) );
-                       $this->output( $this->indent . "Reindexed 
$completed/$totalDocsToReindex documents at $rate/second\n");
+                       $this->output( "done\n" );
+               } else {
+                       $this->reindexInternal( 1, 1 );
+               }
+
+               // Revert settings changed just for reindexing
+               $settings->set( array(
+                       'refresh_interval' => '1s',
+                       'merge.policy.merge_factor' => 10,
+               ) );
+       }
+
+       private function reindexInternal( $children, $childNumber ) {
+               $filter = null;
+               $messagePrefix = "";
+               if ( $childNumber === 1 && $children === 1 ) {
+                       $this->output( $this->indent . "Starting single process 
reindex\n" );
+               } else {
+                       if ( $childNumber >= $children ) {
+                               $this->error( "Invalid parameters - childNumber 
>= children ($childNumber >= $children) ", 1 );
+                       }
+                       $messagePrefix = "[$childNumber] ";
+                       $this->output( $this->indent . $messagePrefix . 
"Starting child process reindex\n" );
+                       // Note that it is not ok to abs(_uid.hashCode) because 
hashCode(Integer.MIN_VALUE) == Integer.MIN_VALUE
+                       $filter = new Elastica\Filter\Script(
+                               "(doc['_uid'].value.hashCode() & 
Integer.MAX_VALUE) % $children == $childNumber" );
+               }
+               try {
+                       $query = new Elastica\Query();
+                       $query->setFields( array( '_id', '_source' ) );
+                       if ( $filter ) {
+                               $query->setFilter( $filter );
+                       }
+
+                       // Note here we dump from the current index (using the 
alias) so we can use CirrusSearchConnection::getPageType
+                       $result = CirrusSearchConnection::getPageType( 
$this->indexType )->search( $query, array(
+                               'search_type' => 'scan',
+                               'scroll' => '1h',
+                               'size'=> $this->reindexChunkSize / 
$this->getShardCount()
+                       ) );
+                       $totalDocsToReindex = $result->getResponse()->getData();
+                       $totalDocsToReindex = 
$totalDocsToReindex['hits']['total'];
+                       $this->output( $this->indent . $messagePrefix . "About 
to reindex $totalDocsToReindex documents\n" );
+                       $operationStartTime = microtime( true );
+                       $completed = 0;
+                       while ( true ) {
+                               wfProfileIn( __METHOD__ . '::receiveDocs' );
+                               $result = $this->getIndex()->search( array(), 
array(
+                                       'scroll_id' => 
$result->getResponse()->getScrollId(),
+                                       'scroll' => '1h'
+                               ) );
+                               wfProfileOut( __METHOD__ . '::receiveDocs' );
+                               if ( !$result->count() ) {
+                                       $this->output( $this->indent . 
$messagePrefix . "All done\n" );
+                                       break;
+                               }
+                               wfProfileIn( __METHOD__ . '::packageDocs' );
+                               $documents = array();
+                               while ( $result->current() ) {
+                                       $document = new \Elastica\Document( 
$result->current()->getId(), $result->current()->getSource() );
+                                       $document->setOpType( 'create' );
+                                       $documents[] = $document;
+                                       $result->next();
+                               }
+                               wfProfileOut( __METHOD__ . '::packageDocs' );
+                               wfProfileIn( __METHOD__ . '::sendDocs' );
+                               $updateResult = 
$this->getPageType()->addDocuments( $documents );
+                               wfDebugLog( 'CirrusSearch', 'Update completed 
in ' . $updateResult->getEngineTime() . ' (engine) millis' );
+                               wfProfileOut( __METHOD__ . '::sendDocs' );
+                               $completed += $result->count();
+                               $rate = round( $completed / ( microtime( true ) 
- $operationStartTime ) );
+                               $this->output( $this->indent . $messagePrefix .
+                                       "Reindexed 
$completed/$totalDocsToReindex documents at $rate/second\n");
+                       }
+               } catch ( \Elastica\Exception\ExceptionInterface $e ) {
+                       // Note that we can't fail the master here, we have to 
check how many documents are in the new index in the master.
+                       wfLogWarning( "Search backend error during reindex.  
Error message is:  " . $e->getMessage() );
+                       die( 1 );
                }
        }
 
@@ -432,11 +556,13 @@
                $this->getIndex()->create( array(
                        'settings' => array(
                                'number_of_shards' => $this->getShardCount(),
-                               'number_of_replicas' => 
$this->getReplicaCount(),
+                               'number_of_replicas' => 
$this->reindexAndRemoveOk ? 0 : $this->getReplicaCount(),
                                'analysis' => 
CirrusSearchAnalysisConfigBuilder::build(),
+                               'translog.flush_threshold_ops' => 50000,   // 
This is supposed to help with bulk index io load.
                        )
                ), $rebuild );
                $this->closeOk = false;
+               $this->tooFewReplicas = $this->reindexAndRemoveOk;
        }
 
        private function closeAndCorrect( $callback ) {

-- 
To view, visit https://gerrit.wikimedia.org/r/88131
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I51101191a9e1daac794d027af23bbf6a0fb096ab
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to