jenkins-bot has submitted this change and it was merged.
Change subject: Basic help for multi-process indexing
......................................................................
Basic help for multi-process indexing
Teach forceSearchIndex.php how to spit out commands to run itself in
a chunked mode so you can run it on multiple CPUs/machines.
Change-Id: I76d5fde51c7c0bc914274a704e2dfc5eaa37a41e
---
M README
M forceSearchIndex.php
2 files changed, 55 insertions(+), 10 deletions(-)
Approvals:
Demon: Looks good to me, approved
jenkins-bot: Verified
diff --git a/README b/README
index 4dc5d3f..01e89f5 100644
--- a/README
+++ b/README
@@ -30,14 +30,18 @@
Bootstrapping large wikis
-------------------------
-forceSeachIndex.php accepts the --fromId and --toId parameters which can be
used to split up the
-work of bootstrapping the wiki into multiple processes. Most of the load
caused by bootstrapping
-is on MySQL but if you've got a really big database or a few slaves then they
should be able to
-handle a whole bunch of bootstraps at once.
+The --batch-size parameter controls the number of documents read from MySQL and
inserted into
+elasticsearch at one time. It defaults to 500 but you should feel free to
play with it.
-Also, the --batch-size parameter controls the number documents read from MySQL
and inserted into
-elasticsearch at one time. It defaults to 500 because that seems to give good
performence (~100 articles
-per second) in development but you should feel free to play with it.
+forceSearchIndex.php accepts the --fromId and --toId parameters which can be
used to split up the
+work of bootstrapping the wiki into multiple processes. Since most of the
load on search indexing is on the
+indexing script in the php process, you should be able to break the process
into multiple chunks and farm
+them out to multiple php processes/machines. The --buildChunks argument of
forceSearchIndex.php will cause
+the script to build invocations of itself that you can splay out to different
processes. For example:
+ rm -rf /tmp/index_log
+ mkdir /tmp/index_log
+ php forceSearchIndex.php --buildChunks 10 --batch-size 50 |
+ xargs -I{} -t -P4 sh -c 'php {} > /tmp/index_log/$$.log'
Handling elasticsearch outages
@@ -87,4 +91,4 @@
Licensing information
---------------------
CirrusSearch makes use of the Elastica library to connect to elasticsearch
<http://elastica.io/>.
-It is Apache licensed and you can read the license Elastica/LICENSE.txt.
\ No newline at end of file
+It is Apache licensed and you can read the license Elastica/LICENSE.txt.
diff --git a/forceSearchIndex.php b/forceSearchIndex.php
index 0105930..7a0c75c 100644
--- a/forceSearchIndex.php
+++ b/forceSearchIndex.php
@@ -41,13 +41,15 @@
. "query at the cost of having to reindex by page id
rather than time.\n\n"
. "Note: All froms are _exclusive_ and all tos are
_inclusive_.\n"
. "Note 2: Setting fromId and toId use the efficient
query so those are ok.";
- $this->mBatchSize = 500;
+ $this->setBatchSize( 500 );
$this->addOption( 'from', 'Start date of reindex in
YYYY-mm-ddTHH:mm:ssZ (exc. Defaults to 0 epoch.', false, true );
$this->addOption( 'to', 'Stop date of reindex in
YYYY-mm-ddTHH:mm:ssZ. Defaults to now.', false, true );
$this->addOption( 'fromId', 'Start indexing at a specific
page_id. Not useful with --deletes.', false, true );
$this->addOption( 'toId', 'Stop indexing at a specific page_id.
Note useful with --deletes or --from or --to.', false, true );
$this->addOption( 'deletes', 'If this is set then just index
deletes, not updates or creates.', false );
$this->addOption( 'limit', 'Maximum number of pages to process
before exiting the script. Default to unlimited.', false, true );
+ $this->addOption( 'buildChunks', 'Instead of running the script
spit out N commands that can be farmed out to ' .
+ 'different processes or machines to rebuild the index.
Works with fromId and toId, not from and to.', false, true );
}
public function execute() {
@@ -60,6 +62,11 @@
$this->toId = $this->getOption( 'toId' );
$this->indexUpdates = !$this->getOption( 'deletes', false );
$this->limit = $this->getOption( 'limit' );
+ $buildChunks = $this->getOption( 'buildChunks' );
+ if ( $buildChunks !== null ) {
+ $this->buildChunks( $buildChunks );
+ return;
+ }
if ( $this->indexUpdates ) {
$operationName = 'Indexed';
@@ -143,7 +150,7 @@
$search = SearchEngine::create();
if ( $maxUpdate === null ) {
$toIdPart = '';
- if ( !is_null( $this->toId ) ) {
+ if ( $this->toId !== null ) {
$toId = $dbr->addQuotes( $this->toId );
$toIdPart = " AND page_id <= $toId";
}
@@ -242,6 +249,40 @@
wfProfileOut( __METHOD__ );
return $result;
}
+
+ private function buildChunks( $chunks ) {
+ $dbr = $this->getDB( DB_SLAVE );
+ if ( $this->toId === null ) {
+ $this->toId = $dbr->selectField( 'page', 'MAX(page_id)'
);
+ if ( $this->toId === false ) {
+ $this->error( "Couldn't find any pages to
index. toId = $this->toId.", 1 );
+ }
+ }
+ $fromId = $this->getOption( 'fromId' );
+ if ( $fromId === null ) {
+ $fromId = $dbr->selectField( 'page', 'MIN(page_id) - 1'
);
+ if ( $fromId === false ) {
+ $this->error( "Couldn't find any pages to
index. fromId = $fromId.", 1 );
+ }
+ }
+ if ( $fromId === $this->toId ) {
+ $this->error( "Couldn't find any pages to index.
fromId = $fromId = $this->toId = toId.", 1 );
+ }
+ $chunkSize = max( 1, ceil( ( $this->toId - $fromId ) / $chunks
) );
+ for ( $id = $fromId; $id < $this->toId; $id = $id + $chunkSize
) {
+ $chunkToId = min( $this->toId, $id + $chunkSize );
+ $this->output( $this->mSelf );
+ foreach ( $this->mOptions as $optName => $optVal ) {
+ if ( $optVal === null || $optVal === false ||
$optName === 'fromId' ||
+ $optName === 'toId' || $optName
=== 'buildChunks' ||
+ ($optName === 'memory-limit' &&
$optVal === 'max')) {
+ continue;
+ }
+ $this->output( " --$optName $optVal" );
+ }
+ $this->output( " --fromId $id --toId $chunkToId\n" );
+ }
+ }
}
$maintClass = "ForceSearchIndex";
--
To view, visit https://gerrit.wikimedia.org/r/78834
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I76d5fde51c7c0bc914274a704e2dfc5eaa37a41e
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: Demon <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits