Daniel Kinzler has uploaded a new change for review. https://gerrit.wikimedia.org/r/51847
Change subject: Rewrite of rebuildTermSearchKey ...................................................................... Rewrite of rebuildTermSearchKey * added command line control over batch size, start row, etc * use row id for batching * added progress reporting Change-Id: I4d2b9fcaa9d848a96540e51fdc385355e0e466d0 --- M repo/Wikibase.php A repo/includes/store/sql/TermSearchKeyBuilder.php M repo/includes/store/sql/TermSqlCache.php M repo/maintenance/rebuildTermsSearchKey.php 4 files changed, 298 insertions(+), 113 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/47/51847/1 diff --git a/repo/Wikibase.php b/repo/Wikibase.php index 56187af..0237e6a 100644 --- a/repo/Wikibase.php +++ b/repo/Wikibase.php @@ -182,6 +182,7 @@ $wgAutoloadClasses['Wikibase\TermCache'] = $dir . 'includes/store/TermCache.php'; $wgAutoloadClasses['Wikibase\TermCombinationMatchFinder'] = $dir . 'includes/store/TermCombinationMatchFinder.php'; $wgAutoloadClasses['Wikibase\TermMatchScoreCalculator'] = $dir . 'includes/store/TermMatchScoreCalculator.php'; +$wgAutoloadClasses['Wikibase\TermSearchKeyBuilder'] = $dir . 'includes/store/sql/TermSearchKeyBuilder.php'; // includes/store/sql $wgAutoloadClasses['Wikibase\SqlIdGenerator'] = $dir . 'includes/store/sql/SqlIdGenerator.php'; diff --git a/repo/includes/store/sql/TermSearchKeyBuilder.php b/repo/includes/store/sql/TermSearchKeyBuilder.php new file mode 100644 index 0000000..f6e6248 --- /dev/null +++ b/repo/includes/store/sql/TermSearchKeyBuilder.php @@ -0,0 +1,238 @@ +<?php + +namespace Wikibase; +use Iterator, DatabaseBase; + +/** + * Utility class for rebuilding the term_search_key field. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @since 0.4 + * + * @file + * @ingroup WikibaseRepo + * + * @licence GNU GPL v2+ + * @author Jeroen De Dauw < [email protected] > + * @author Jens Ohlig < [email protected] > + * @author Daniel Kinzler + */ +class TermSearchKeyBuilder { + + /** + * @since 0.4 + * + * @var TermSqlCache $table + */ + protected $table; + + /** + * @since 0.4 + * + * @var MessageReporter $reporter + */ + protected $reporter; + + /** + * Whether all keys should be updated, or only missing keys + * + * @var bool + */ + protected $all = true; + + /** + * The ID of the first row to update; rows with a lower term_row_id are skipped + * + * @var int + */ + protected $fromId = 1; + + /** + * The batch size, giving the number of rows to be updated in each database transaction. + * + * @var int + */ + protected $batchSize = 100; + + /** + * Constructor.
+ * + * @since 0.4 + * + * @param TermSqlCache $table + */ + public function __construct( TermSqlCache $table ) { + $this->table = $table; + } + + /** + * @return boolean + */ + public function getRebuildAll() { + return $this->all; + } + + /** + * @return int + */ + public function getBatchSize() { + return $this->batchSize; + } + + /** + * @return int + */ + public function getFromId() { + return $this->fromId; + } + + /** + * @param boolean $all + */ + public function setRebuildAll( $all ) { + $this->all = $all; + } + + /** + * @param int $batchSize + */ + public function setBatchSize( $batchSize ) { + $this->batchSize = $batchSize; + } + + /** + * @param int $fromId + */ + public function setFromId( $fromId ) { + $this->fromId = $fromId; + } + + /** + * Sets the reporter to use for reporting progress. + * + * @param \MessageReporter $reporter + */ + public function setReporter( \MessageReporter $reporter ) { + $this->reporter = $reporter; + } + + /** + * Rebuild the search key field term_search_key from the source term_text field. + * Use the rebuildTermsSearchKey.php maintenance script to invoke this from the command line. + * + * Database updates are batched into multiple transactions. Do not call this + * method within an (explicit) database transaction. + * + * @since 0.4 + */ + public function rebuildSearchKey() { + $dbw = $this->table->getWriteDb(); + + $rowId = $this->fromId -1; + + $total = 0; + + while ( true ) { + $dbw->begin(); + + $terms = $dbw->select( + $this->table->getTableName(), + array( + 'term_row_id', + 'term_language', + 'term_text', + ), + array( + 'term_row_id > ' . (int) $rowId, + $this->all ?
'1' : 'term_search_key = \'\'', // if not $all, only set missing keys + ), + __METHOD__, + array( + 'LIMIT' => $this->batchSize, + 'ORDER BY term_row_id ASC', + 'FOR UPDATE' + ) + ); + + $c = 0; + + foreach ( $terms as $row ) { + $this->updateSearchKey( $dbw, $row->term_row_id, $row->term_text, $row->term_language ); + $rowId = $row->term_row_id; + $c+= 1; + } + + $dbw->commit(); + + $this->report( "Updated $c search keys, up to row $rowId." ); + $total += $c; + + if ( $c < $this->batchSize ) { + // we are done. + break; + } + } + + return $total; + } + + /** + * Updates a single row with a newly calculated search key. + * The search key is calculated using Term::normalizeText(). + * + * @see Term::normalizeText + * + * @since 0.4 + * + * @param \DatabaseBase $dbw the database connection to use + * @param int $rowId the row to update + * @param string $text the term's text + * @param string $lang the term's language + * + * @return string the search key + */ + protected function updateSearchKey( \DatabaseBase $dbw, $rowId, $text, $lang ) { + $key = Term::normalizeText( $text, $lang ); + + $dbw->update( + $this->table->getTableName(), + array( + 'term_search_key' => $key, + ), + array( + 'term_row_id' => $rowId, + ), + __METHOD__ + ); + + return $key; + } + + /** + * Reports a message + * + * @since 0.4 + * + * @param $msg + */ + protected function report( $msg ) { + if ( $this->reporter ) { + $this->reporter->reportMessage( $msg ); + } + } + +} \ No newline at end of file diff --git a/repo/includes/store/sql/TermSqlCache.php b/repo/includes/store/sql/TermSqlCache.php index a43376b..faecbda 100644 --- a/repo/includes/store/sql/TermSqlCache.php +++ b/repo/includes/store/sql/TermSqlCache.php @@ -75,6 +75,18 @@ } /** + * Returns the name of the database table used to store the terms. + * This is the logical table name, subject to prefixing by the Database object.
+ * + * @since 0.4 + * + * @return string + */ + public function getTableName() { + return $this->tableName; + } + + /** * @see TermCache::saveTermsOfEntity * * @since 0.1 @@ -326,14 +338,25 @@ } /** - * Returns the Database from which to read. + * Returns the Database connection from which to read. * * @since 0.1 * * @return \DatabaseBase */ - protected function getReadDb() { - return wfGetDB( $this->readDb ); + public function getReadDb() { + return wfGetDB( $this->readDb ); // TODO: allow foreign db + } + + /** + * Returns the Database connection to which to write. + * + * @since 0.4 + * + * @return \DatabaseBase + */ + public function getWriteDb() { + return wfGetDB( DB_MASTER ); // TODO: allow foreign db } /** @@ -790,112 +813,4 @@ return $resultTerms; } - - /** - * Rebuild the search key field term_search_key from the source term_text field. - * - * FIXME: for some unknown reason some rows are skipped in the rebuild - * - * @since 0.2 - */ - public function rebuildSearchKey() { - $dbw = wfGetDB( DB_MASTER ); - - $entityId = -1; - $entityType = ''; - $language = ''; - $type = ''; - $text = ''; - - $limit = 10; - - $hasMoreResults = true; - - while ( $hasMoreResults ) { - $terms = $dbw->select( - $this->tableName, - array( - 'term_entity_id', - 'term_entity_type', - 'term_language', - 'term_type', - 'term_text', - ), - array( - 'term_entity_id >= ' . $dbw->addQuotes( $entityId ), - 'term_entity_type >= ' . $dbw->addQuotes( $entityType ), - 'term_language >= ' . $dbw->addQuotes( $language ), - 'term_type >= ' . $dbw->addQuotes( $type ), - 'term_text >= ' .
$dbw->addQuotes( $text ), - ), - __METHOD__, - array( - 'LIMIT' => $limit, - 'ORDER BY term_entity_id, term_entity_type, term_language, term_type, term_text ASC, ASC, ASC, ASC, ASC' - ) - ); - - $continuationTerm = $this->rebuildSearchKeyForTerms( $terms, $dbw, $limit ); - - if ( $continuationTerm === null ) { - $hasMoreResults = false; - } - else { - $entityId = $continuationTerm->term_entity_id; - $entityType = $continuationTerm->term_entity_type; - $language = $continuationTerm->term_language; - $type = $continuationTerm->term_type; - $text = $continuationTerm->term_text; - } - } - } - - /** - * @since 0.2 - * - * @param Iterator $terms - * @param DatabaseBase $dbw - * @param integer $limit - * - * @return object|null The continuation term if there is one or null - */ - protected function rebuildSearchKeyForTerms( Iterator $terms, DatabaseBase $dbw, $limit ) { - $termNumber = 0; - - $doTrx = $dbw->trxLevel() === 0; - - if ( $doTrx ) { - $dbw->begin(); - } - - $hasMoreResults = false; - - foreach ( $terms as $term ) { - $dbw->update( - $this->tableName, - array( - 'term_search_key' => Term::normalizeText( $term->term_text, $term->term_language ) - ), - array( - 'term_entity_id' => $term->term_entity_id, - 'term_entity_type' => $term->term_entity_type, - 'term_language' => $term->term_language, - 'term_type' => $term->term_type, - 'term_text' => $term->term_text, - ), - __METHOD__ - ); - - if ( ++$termNumber >= $limit ) { - $hasMoreResults = true; - } - } - - if ( $doTrx ) { - $dbw->commit(); - } - - return $hasMoreResults ? 
$term : null; - } - } diff --git a/repo/maintenance/rebuildTermsSearchKey.php b/repo/maintenance/rebuildTermsSearchKey.php index 48e06cf..30636fe 100644 --- a/repo/maintenance/rebuildTermsSearchKey.php +++ b/repo/maintenance/rebuildTermsSearchKey.php @@ -36,9 +36,13 @@ class RebuildTermsSearchKey extends LoggedUpdateMaintenance { public function __construct() { + parent::__construct(); + $this->mDescription = 'Rebuild the search key of the TermSQLCache'; - parent::__construct(); + $this->addOption( 'only-missing', "Update only missing keys (per default, all keys are updated)" ); + $this->addOption( 'start-row', "The ID of the first row to update (useful for continuing aborted runs)", false, true ); + $this->addOption( 'batch-size', "Number of rows to update per database transaction (100 per default)", false, true ); } /** @@ -52,7 +56,23 @@ exit; } - StoreFactory::getStore( 'sqlstore' )->newTermCache()->rebuildSearchKey(); + $reporter = new \ObservableMessageReporter(); + $reporter->registerReporterCallback( + array( $this, 'report' ) + ); + + $table = StoreFactory::getStore( 'sqlstore' )->newTermCache(); + $builder = new TermSearchKeyBuilder( $table ); + $builder->setReporter( $reporter ); + + $builder->setBatchSize( intval( $this->getOption( 'batch-size', 100 ) ) ); + $builder->setRebuildAll( !$this->getOption( 'only-missing', false ) ); + $builder->setFromId( intval( $this->getOption( 'start-row', 1 ) ) ); + + $n = $builder->rebuildSearchKey(); + + $this->output( "Done. Updated $n search keys.\n" ); + return true; } @@ -65,6 +85,17 @@ return 'Wikibase\RebuildTermsSearchKey'; } + /** + * Outputs a message via the output() method.
+ * + * @since 0.4 + * + * @param $msg + */ + public function report( $msg ) { + $this->output( "$msg\n" ); + } + } $maintClass = 'Wikibase\RebuildTermsSearchKey'; -- To view, visit https://gerrit.wikimedia.org/r/51847 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I4d2b9fcaa9d848a96540e51fdc385355e0e466d0 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Daniel Kinzler <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
