Aude has submitted this change and it was merged.
Change subject: Rewrite of rebuildTermSearchKey
......................................................................
Rewrite of rebuildTermSearchKey
* added command line control over batch size, start row, etc
* use row id for batching
* added progress reporting
Change-Id: I4d2b9fcaa9d848a96540e51fdc385355e0e466d0
---
M repo/Wikibase.hooks.php
M repo/Wikibase.php
A repo/includes/store/sql/TermSearchKeyBuilder.php
M repo/includes/store/sql/TermSqlCache.php
M repo/maintenance/rebuildTermsSearchKey.php
A repo/tests/phpunit/includes/store/sql/TermSearchKeyBuilderTest.php
M repo/tests/phpunit/includes/store/sql/TermSqlCacheTest.php
7 files changed, 412 insertions(+), 120 deletions(-)
Approvals:
Aude: Verified; Looks good to me, approved
jenkins-bot: Verified
diff --git a/repo/Wikibase.hooks.php b/repo/Wikibase.hooks.php
index 8040b26..44a3c62 100644
--- a/repo/Wikibase.hooks.php
+++ b/repo/Wikibase.hooks.php
@@ -202,6 +202,7 @@
'store/sql/SqlIdGenerator',
'store/sql/TermSqlCache',
+ 'store/sql/TermSearchKeyBuilder',
'updates/ItemDeletionUpdate',
'updates/ItemModificationUpdate',
diff --git a/repo/Wikibase.php b/repo/Wikibase.php
index 56187af..0237e6a 100644
--- a/repo/Wikibase.php
+++ b/repo/Wikibase.php
@@ -182,6 +182,7 @@
$wgAutoloadClasses['Wikibase\TermCache'] = $dir
. 'includes/store/TermCache.php';
$wgAutoloadClasses['Wikibase\TermCombinationMatchFinder'] = $dir .
'includes/store/TermCombinationMatchFinder.php';
$wgAutoloadClasses['Wikibase\TermMatchScoreCalculator'] = $dir .
'includes/store/TermMatchScoreCalculator.php';
+$wgAutoloadClasses['Wikibase\TermSearchKeyBuilder'] = $dir .
'includes/store/sql/TermSearchKeyBuilder.php';
// includes/store/sql
$wgAutoloadClasses['Wikibase\SqlIdGenerator'] = $dir .
'includes/store/sql/SqlIdGenerator.php';
diff --git a/repo/includes/store/sql/TermSearchKeyBuilder.php
b/repo/includes/store/sql/TermSearchKeyBuilder.php
new file mode 100644
index 0000000..f6e6248
--- /dev/null
+++ b/repo/includes/store/sql/TermSearchKeyBuilder.php
@@ -0,0 +1,238 @@
+<?php
+
+namespace Wikibase;
+use Iterator, DatabaseBase;
+
+/**
+ * Utility class for rebuilding the term_search_key field.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 0.4
+ *
+ * @file
+ * @ingroup WikibaseRepo
+ *
+ * @licence GNU GPL v2+
+ * @author Jeroen De Dauw < [email protected] >
+ * @author Jens Ohlig < [email protected] >
+ * @author Daniel Kinzler
+ */
+class TermSearchKeyBuilder {
+
+ /**
+ * @since 0.4
+ *
+ * @var TermSqlCache $table
+ */
+ protected $table;
+
+ /**
+ * @since 0.4
+ *
+ * @var MessageReporter $reporter
+ */
+ protected $reporter;
+
+ /**
+ * Whether all keys should be updated, or only missing keys
+ *
+ * @var bool
+ */
+ protected $all = true;
+
+ /**
+ * Whether all keys should be updated, or only missing keys
+ *
+ * @var bool
+ */
+ protected $fromId = 1;
+
+ /**
+ * The batch size, giving the number of rows to be updated in each
database transaction.
+ *
+ * @var int
+ */
+ protected $batchSize = 100;
+
+ /**
+ * Constructor.
+ *
+ * @since 0.4
+ *
+ * @param TermSqlCache $table
+ */
+ public function __construct( TermSqlCache $table ) {
+ $this->table = $table;
+ }
+
+ /**
+ * @return boolean
+ */
+ public function getRebuildAll() {
+ return $this->all;
+ }
+
+ /**
+ * @return int
+ */
+ public function getBatchSize() {
+ return $this->batchSize;
+ }
+
+ /**
+ * @return boolean
+ */
+ public function getFromId() {
+ return $this->fromId;
+ }
+
+ /**
+ * @param boolean $all
+ */
+ public function setRebuildAll( $all ) {
+ $this->all = $all;
+ }
+
+ /**
+ * @param int $batchSize
+ */
+ public function setBatchSize( $batchSize ) {
+ $this->batchSize = $batchSize;
+ }
+
+ /**
+ * @param boolean $fromId
+ */
+ public function setFromId( $fromId ) {
+ $this->fromId = $fromId;
+ }
+
+ /**
+ * Sets the reporter to use for reporting preogress.
+ *
+ * @param \MessageReporter $reporter
+ */
+ public function setReporter( \MessageReporter $reporter ) {
+ $this->reporter = $reporter;
+ }
+
+ /**
+ * Rebuild the search key field term_search_key from the source
term_text field.
+ * Use the rebuildSearchKey.php maintenance script to invoke this from
the command line.
+ *
+ * Database updates a batched into multiple transactions. Do not call
this
+ * method whithin an (explicite) database transaction.
+ *
+ * @since 0.4
+ */
+ public function rebuildSearchKey() {
+ $dbw = $this->table->getWriteDb();
+
+ $rowId = $this->fromId -1;
+
+ $total = 0;
+
+ while ( true ) {
+ $dbw->begin();
+
+ $terms = $dbw->select(
+ $this->table->getTableName(),
+ array(
+ 'term_row_id',
+ 'term_language',
+ 'term_text',
+ ),
+ array(
+ 'term_row_id > ' . (int) $rowId,
+ $this->all ? '1' : 'term_search_key =
\'\'', // if not $all, only set missing keys
+ ),
+ __METHOD__,
+ array(
+ 'LIMIT' => $this->batchSize,
+ 'ORDER BY term_row_id ASC',
+ 'FOR UPDATE'
+ )
+ );
+
+ $c = 0;
+
+ foreach ( $terms as $row ) {
+ $this->updateSearchKey( $dbw,
$row->term_row_id, $row->term_text, $row->term_language );
+ $rowId = $row->term_row_id;
+ $c+= 1;
+ }
+
+ $dbw->commit();
+
+ $this->report( "Updated $c search keys, up to row
$rowId." );
+ $total += $c;
+
+ if ( $c < $this->batchSize ) {
+ // we are done.
+ break;
+ }
+ }
+
+ return $total;
+ }
+
+ /**
+ * Updates a single row with a newley calculated search key.
+ * The search key is calculated using Term::normalizeText().
+ *
+ * @see Term::normalizeText
+ *
+ * @since 0.4
+ *
+ * @param \DatabaseBase $dbw the database connection to use
+ * @param int $rowId the row to update
+ * @param string $text the term's text
+ * @param string $lang the term's language
+ *
+ * @return string the search key
+ */
+ protected function updateSearchKey( \DatabaseBase $dbw, $rowId, $text,
$lang ) {
+ $key = Term::normalizeText( $text, $lang );
+
+ $dbw->update(
+ $this->table->getTableName(),
+ array(
+ 'term_search_key' => $key,
+ ),
+ array(
+ 'term_row_id' => $rowId,
+ ),
+ __METHOD__
+ );
+
+ return $key;
+ }
+
+ /**
+ * reports a message
+ *
+ * @since 0.4
+ *
+ * @param $msg
+ */
+ protected function report( $msg ) {
+ if ( $this->reporter ) {
+ $this->reporter->reportMessage( $msg );
+ }
+ }
+
+}
\ No newline at end of file
diff --git a/repo/includes/store/sql/TermSqlCache.php
b/repo/includes/store/sql/TermSqlCache.php
index 7da45c4..8fec0be 100644
--- a/repo/includes/store/sql/TermSqlCache.php
+++ b/repo/includes/store/sql/TermSqlCache.php
@@ -75,6 +75,18 @@
}
/**
+ * Returns the name of the database table used to store the terms.
+ * This is the logical table name, subject to prefixing by the Database
object.
+ *
+ * @since 0.4
+ *
+ * @return string
+ */
+ public function getTableName() {
+ return $this->tableName;
+ }
+
+ /**
* @see TermCache::saveTermsOfEntity
*
* @since 0.1
@@ -326,14 +338,25 @@
}
/**
- * Returns the Database from which to read.
+ * Returns the Database connection from which to read.
*
* @since 0.1
*
* @return \DatabaseBase
*/
- protected function getReadDb() {
- return wfGetDB( $this->readDb );
+ public function getReadDb() {
+ return wfGetDB( $this->readDb ); // TODO: allow foreign db
+ }
+
+ /**
+ * Returns the Database connection to wich to write.
+ *
+ * @since 0.4
+ *
+ * @return \DatabaseBase
+ */
+ public function getWriteDb() {
+ return wfGetDB( DB_MASTER ); // TODO: allow foreign db
}
/**
@@ -790,112 +813,4 @@
return $resultTerms;
}
-
- /**
- * Rebuild the search key field term_search_key from the source
term_text field.
- *
- * FIXME: for some unknown reason some rows are skipped in the rebuild
- *
- * @since 0.2
- */
- public function rebuildSearchKey() {
- $dbw = wfGetDB( DB_MASTER );
-
- $entityId = -1;
- $entityType = '';
- $language = '';
- $type = '';
- $text = '';
-
- $limit = 10;
-
- $hasMoreResults = true;
-
- while ( $hasMoreResults ) {
- $terms = $dbw->select(
- $this->tableName,
- array(
- 'term_entity_id',
- 'term_entity_type',
- 'term_language',
- 'term_type',
- 'term_text',
- ),
- array(
- 'term_entity_id >= ' . $dbw->addQuotes(
$entityId ),
- 'term_entity_type >= ' .
$dbw->addQuotes( $entityType ),
- 'term_language >= ' . $dbw->addQuotes(
$language ),
- 'term_type >= ' . $dbw->addQuotes(
$type ),
- 'term_text >= ' . $dbw->addQuotes(
$text ),
- ),
- __METHOD__,
- array(
- 'LIMIT' => $limit,
- 'ORDER BY term_entity_id,
term_entity_type, term_language, term_type, term_text ASC, ASC, ASC, ASC, ASC'
- )
- );
-
- $continuationTerm = $this->rebuildSearchKeyForTerms(
$terms, $dbw, $limit );
-
- if ( $continuationTerm === null ) {
- $hasMoreResults = false;
- }
- else {
- $entityId = $continuationTerm->term_entity_id;
- $entityType =
$continuationTerm->term_entity_type;
- $language = $continuationTerm->term_language;
- $type = $continuationTerm->term_type;
- $text = $continuationTerm->term_text;
- }
- }
- }
-
- /**
- * @since 0.2
- *
- * @param Iterator $terms
- * @param DatabaseBase $dbw
- * @param integer $limit
- *
- * @return object|null The continuation term if there is one or null
- */
- protected function rebuildSearchKeyForTerms( Iterator $terms,
DatabaseBase $dbw, $limit ) {
- $termNumber = 0;
-
- $doTrx = $dbw->trxLevel() === 0;
-
- if ( $doTrx ) {
- $dbw->begin();
- }
-
- $hasMoreResults = false;
-
- foreach ( $terms as $term ) {
- $dbw->update(
- $this->tableName,
- array(
- 'term_search_key' =>
Term::normalizeText( $term->term_text, $term->term_language )
- ),
- array(
- 'term_entity_id' =>
$term->term_entity_id,
- 'term_entity_type' =>
$term->term_entity_type,
- 'term_language' => $term->term_language,
- 'term_type' => $term->term_type,
- 'term_text' => $term->term_text,
- ),
- __METHOD__
- );
-
- if ( ++$termNumber >= $limit ) {
- $hasMoreResults = true;
- }
- }
-
- if ( $doTrx ) {
- $dbw->commit();
- }
-
- return $hasMoreResults ? $term : null;
- }
-
}
diff --git a/repo/maintenance/rebuildTermsSearchKey.php
b/repo/maintenance/rebuildTermsSearchKey.php
index 48e06cf..30636fe 100644
--- a/repo/maintenance/rebuildTermsSearchKey.php
+++ b/repo/maintenance/rebuildTermsSearchKey.php
@@ -36,9 +36,13 @@
class RebuildTermsSearchKey extends LoggedUpdateMaintenance {
public function __construct() {
+ parent::__construct();
+
$this->mDescription = 'Rebuild the search key of the
TermSQLCache';
- parent::__construct();
+ $this->addOption( 'only-missing', "Update only missing keys
(per default, all keys are updated)" );
+ $this->addOption( 'start-row', "The ID of the first row to
update (useful for continuing aborted runs)", false, true );
+ $this->addOption( 'batch-size', "Number of rows to update per
database transaction (100 per default)", false, true );
}
/**
@@ -52,7 +56,23 @@
exit;
}
- StoreFactory::getStore( 'sqlstore'
)->newTermCache()->rebuildSearchKey();
+ $reporter = new \ObservableMessageReporter();
+ $reporter->registerReporterCallback(
+ array( $this, 'report' )
+ );
+
+ $table = StoreFactory::getStore( 'sqlstore' )->newTermCache();
+ $builder = new TermSearchKeyBuilder( $table );
+ $builder->setReporter( $reporter );
+
+ $builder->setBatchSize( intval( $this->getOption( 'batch-size',
100 ) ) );
+ $builder->setRebuildAll( !$this->getOption( 'only-missing',
false ) );
+ $builder->setFromId( intval( $this->getOption( 'start-row', 1 )
) );
+
+ $n = $builder->rebuildSearchKey();
+
+ $this->output( "Done. Updated $n search keys.\n" );
+
return true;
}
@@ -65,6 +85,17 @@
return 'Wikibase\RebuildTermsSearchKey';
}
+ /**
+ * Outputs a message vis the output() method.
+ *
+ * @since 0.4
+ *
+ * @param $msg
+ */
+ public function report( $msg ) {
+ $this->output( "$msg\n" );
+ }
+
}
$maintClass = 'Wikibase\RebuildTermsSearchKey';
diff --git a/repo/tests/phpunit/includes/store/sql/TermSearchKeyBuilderTest.php
b/repo/tests/phpunit/includes/store/sql/TermSearchKeyBuilderTest.php
new file mode 100644
index 0000000..53fb08a
--- /dev/null
+++ b/repo/tests/phpunit/includes/store/sql/TermSearchKeyBuilderTest.php
@@ -0,0 +1,110 @@
+<?php
+
+namespace Wikibase\Test;
+use Wikibase\TermSqlCache;
+use Wikibase\Term;
+
+/**
+ * Tests for the Wikibase\TermSqlCache class.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @since 0.2
+ *
+ * @ingroup WikibaseRepoTest
+ * @ingroup Test
+ *
+ * @group Wikibase
+ * @group WikibaseStore
+ * @group WikibaseTerm
+ * @group Database
+ *
+ * Some of the tests takes more time, and needs therefor longer time before
they can be aborted
+ * as non-functional. The reason why tests are aborted is assumed to be set up
of temporal databases
+ * that hold the first tests in a pending state awaiting access to the
database.
+ * @group medium
+ *
+ * @licence GNU GPL v2+
+ * @author Jeroen De Dauw < [email protected] >
+ */
+class TermSearchKeyBuilderTest extends \MediaWikiTestCase {
+
+ public function termProvider() {
+ $argLists = array();
+
+ $argLists[] = array( 'en', 'FoO', 'fOo', true );
+ $argLists[] = array( 'ru', 'Берлин', ' берлин ', true );
+
+ $argLists[] = array( 'en', 'FoO', 'bar', false );
+ $argLists[] = array( 'ru', 'Берлин', 'бе55585рлин', false );
+
+ return $argLists;
+ }
+
+ /**
+ * @dataProvider termProvider
+ * @param $languageCode
+ * @param $termText
+ * @param $searchText
+ * @param boolean $matches
+ */
+ public function testRebuildSearchKey( $languageCode, $termText,
$searchText, $matches ) {
+ if ( \Wikibase\Settings::get( 'withoutTermSearchKey' ) ) {
+ $this->markTestSkipped( "can't test search key if
withoutTermSearchKey option is set." );
+ }
+
+ // make term in item
+ $item = \Wikibase\Item::newEmpty();
+ $item->setId( 42 );
+ $item->setLabel( $languageCode, $termText );
+
+ // save term
+ /* @var TermSqlCache $termCache */
+ $termCache = \Wikibase\StoreFactory::getStore( 'sqlstore'
)->newTermCache();
+ $termCache->clear();
+ $termCache->saveTermsOfEntity( $item );
+
+ // remove search key
+ $dbw = wfGetDB( DB_MASTER );
+ $dbw->update( $termCache->getTableName(), array(
'term_search_key' => '' ), array(), __METHOD__ );
+
+ // rebuild search key
+ $builder = new \Wikibase\TermSearchKeyBuilder( $termCache );
+ $builder->setRebuildAll( true );
+ $builder->rebuildSearchKey();
+
+ // remove search key
+ $term = new Term();
+ $term->setLanguage( $languageCode );
+ $term->setText( $searchText );
+
+ $options = array(
+ 'caseSensitive' => false,
+ );
+
+ $obtainedTerms = $termCache->getMatchingTerms( array( $term ),
Term::TYPE_LABEL, \Wikibase\Item::ENTITY_TYPE, $options );
+
+ $this->assertEquals( $matches ? 1 : 0, count( $obtainedTerms )
);
+
+ if ( $matches ) {
+ $obtainedTerm = array_shift( $obtainedTerms );
+
+ $this->assertEquals( $termText,
$obtainedTerm->getText() );
+ }
+ }
+
+}
diff --git a/repo/tests/phpunit/includes/store/sql/TermSqlCacheTest.php
b/repo/tests/phpunit/includes/store/sql/TermSqlCacheTest.php
index 5953a80..256a6a4 100644
--- a/repo/tests/phpunit/includes/store/sql/TermSqlCacheTest.php
+++ b/repo/tests/phpunit/includes/store/sql/TermSqlCacheTest.php
@@ -30,8 +30,8 @@
*
* @group Wikibase
* @group WikibaseStore
+ * @group WikibaseTerm
* @group Database
- * @group TermSqlCacheTest
*
* Some of the tests takes more time, and needs therefor longer time before
they can be aborted
* as non-functional. The reason why tests are aborted is assumed to be set up
of temporal databases
@@ -47,9 +47,7 @@
$argLists = array();
$argLists[] = array( 'en', 'FoO', 'fOo', true );
-
- // TODO: enable once we fix our normalization and lowercasing
to work with cyrillic
- //$argLists[] = array( 'ru', 'Берлин', 'берлин', true );
+ $argLists[] = array( 'ru', 'Берлин', 'берлин', true );
$argLists[] = array( 'en', 'FoO', 'bar', false );
$argLists[] = array( 'ru', 'Берлин', 'бе55585рлин', false );
@@ -64,7 +62,7 @@
* @param $searchText
* @param boolean $matches
*/
- public function testRebuildSearchKey( $languageCode, $termText,
$searchText, $matches ) {
+ public function testGetMatchingTerms( $languageCode, $termText,
$searchText, $matches ) {
if ( \Wikibase\Settings::get( 'withoutTermSearchKey' ) ) {
$this->markTestSkipped( "can't test search key if
withoutTermSearchKey option is set." );
}
@@ -82,8 +80,6 @@
$item->setLabel( $languageCode, $termText );
$termCache->saveTermsOfEntity( $item );
-
- $termCache->rebuildSearchKey();
$term = new Term();
$term->setLanguage( $languageCode );
--
To view, visit https://gerrit.wikimedia.org/r/51847
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I4d2b9fcaa9d848a96540e51fdc385355e0e466d0
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Aude <[email protected]>
Gerrit-Reviewer: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Jens Ohlig <[email protected]>
Gerrit-Reviewer: Jeroen De Dauw <[email protected]>
Gerrit-Reviewer: John Erling Blad <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits