Aude has submitted this change and it was merged.

Change subject: Rewrite of rebuildTermSearchKey
......................................................................


Rewrite of rebuildTermSearchKey

* added command line control over batch size, start row, etc
* use row id for batching
* added progress reporting

Change-Id: I4d2b9fcaa9d848a96540e51fdc385355e0e466d0
---
M repo/Wikibase.hooks.php
M repo/Wikibase.php
A repo/includes/store/sql/TermSearchKeyBuilder.php
M repo/includes/store/sql/TermSqlCache.php
M repo/maintenance/rebuildTermsSearchKey.php
A repo/tests/phpunit/includes/store/sql/TermSearchKeyBuilderTest.php
M repo/tests/phpunit/includes/store/sql/TermSqlCacheTest.php
7 files changed, 412 insertions(+), 120 deletions(-)

Approvals:
  Aude: Verified; Looks good to me, approved
  jenkins-bot: Verified



diff --git a/repo/Wikibase.hooks.php b/repo/Wikibase.hooks.php
index 8040b26..44a3c62 100644
--- a/repo/Wikibase.hooks.php
+++ b/repo/Wikibase.hooks.php
@@ -202,6 +202,7 @@
 
                        'store/sql/SqlIdGenerator',
                        'store/sql/TermSqlCache',
+                       'store/sql/TermSearchKeyBuilder',
 
                        'updates/ItemDeletionUpdate',
                        'updates/ItemModificationUpdate',
diff --git a/repo/Wikibase.php b/repo/Wikibase.php
index 56187af..0237e6a 100644
--- a/repo/Wikibase.php
+++ b/repo/Wikibase.php
@@ -182,6 +182,7 @@
 $wgAutoloadClasses['Wikibase\TermCache']                               = $dir 
. 'includes/store/TermCache.php';
 $wgAutoloadClasses['Wikibase\TermCombinationMatchFinder'] = $dir . 
'includes/store/TermCombinationMatchFinder.php';
 $wgAutoloadClasses['Wikibase\TermMatchScoreCalculator'] = $dir . 
'includes/store/TermMatchScoreCalculator.php';
+$wgAutoloadClasses['Wikibase\TermSearchKeyBuilder']     = $dir . 
'includes/store/sql/TermSearchKeyBuilder.php';
 
 // includes/store/sql
 $wgAutoloadClasses['Wikibase\SqlIdGenerator']                  = $dir . 
'includes/store/sql/SqlIdGenerator.php';
diff --git a/repo/includes/store/sql/TermSearchKeyBuilder.php 
b/repo/includes/store/sql/TermSearchKeyBuilder.php
new file mode 100644
index 0000000..f6e6248
--- /dev/null
+++ b/repo/includes/store/sql/TermSearchKeyBuilder.php
@@ -0,0 +1,238 @@
+<?php
+
+namespace Wikibase;
+use Iterator, DatabaseBase;
+
+/**
+ * Utility class for rebuilding the term_search_key field.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 0.4
+ *
+ * @file
+ * @ingroup WikibaseRepo
+ *
+ * @licence GNU GPL v2+
+ * @author Jeroen De Dauw < [email protected] >
+ * @author Jens Ohlig < [email protected] >
+ * @author Daniel Kinzler
+ */
+class TermSearchKeyBuilder {
+
+       /**
+        * @since 0.4
+        *
+        * @var TermSqlCache $table
+        */
+       protected $table;
+
+       /**
+        * @since 0.4
+        *
+        * @var MessageReporter $reporter
+        */
+       protected $reporter;
+
+       /**
+        * Whether all keys should be updated, or only missing keys
+        *
+        * @var bool
+        */
+       protected $all = true;
+
+       /**
+        * Whether all keys should be updated, or only missing keys
+        *
+        * @var bool
+        */
+       protected $fromId = 1;
+
+       /**
+        * The batch size, giving the number of rows to be updated in each 
database transaction.
+        *
+        * @var int
+        */
+       protected $batchSize = 100;
+
+       /**
+        * Constructor.
+        *
+        * @since 0.4
+        *
+        * @param TermSqlCache $table
+        */
+       public function __construct( TermSqlCache $table ) {
+               $this->table = $table;
+       }
+
+       /**
+        * @return boolean
+        */
+       public function getRebuildAll() {
+               return $this->all;
+       }
+
+       /**
+        * @return int
+        */
+       public function getBatchSize() {
+               return $this->batchSize;
+       }
+
+       /**
+        * @return boolean
+        */
+       public function getFromId() {
+               return $this->fromId;
+       }
+
+       /**
+        * @param boolean $all
+        */
+       public function setRebuildAll( $all ) {
+               $this->all = $all;
+       }
+
+       /**
+        * @param int $batchSize
+        */
+       public function setBatchSize( $batchSize ) {
+               $this->batchSize = $batchSize;
+       }
+
+       /**
+        * @param boolean $fromId
+        */
+       public function setFromId( $fromId ) {
+               $this->fromId = $fromId;
+       }
+
+       /**
+        * Sets the reporter to use for reporting preogress.
+        *
+        * @param \MessageReporter $reporter
+        */
+       public function setReporter( \MessageReporter $reporter ) {
+               $this->reporter = $reporter;
+       }
+
+       /**
+        * Rebuild the search key field term_search_key from the source 
term_text field.
+        * Use the rebuildSearchKey.php maintenance script to invoke this from 
the command line.
+        *
+        * Database updates a batched into multiple transactions. Do not call 
this
+        * method whithin an (explicite) database transaction.
+        *
+        * @since 0.4
+        */
+       public function rebuildSearchKey() {
+               $dbw = $this->table->getWriteDb();
+
+               $rowId = $this->fromId -1;
+
+               $total = 0;
+
+               while ( true ) {
+                       $dbw->begin();
+
+                       $terms = $dbw->select(
+                               $this->table->getTableName(),
+                               array(
+                                       'term_row_id',
+                                       'term_language',
+                                       'term_text',
+                               ),
+                               array(
+                                       'term_row_id > ' . (int) $rowId,
+                                       $this->all ? '1' : 'term_search_key = 
\'\'', // if not $all, only set missing keys
+                               ),
+                               __METHOD__,
+                               array(
+                                       'LIMIT' => $this->batchSize,
+                                       'ORDER BY term_row_id ASC',
+                                       'FOR UPDATE'
+                               )
+                       );
+
+                       $c = 0;
+
+                       foreach ( $terms as $row ) {
+                               $this->updateSearchKey( $dbw, 
$row->term_row_id, $row->term_text, $row->term_language );
+                               $rowId = $row->term_row_id;
+                               $c+= 1;
+                       }
+
+                       $dbw->commit();
+
+                       $this->report( "Updated $c search keys, up to row 
$rowId." );
+                       $total += $c;
+
+                       if ( $c < $this->batchSize ) {
+                               // we are done.
+                               break;
+                       }
+               }
+
+               return $total;
+       }
+
+       /**
+        * Updates a single row with a newley calculated search key.
+        * The search key is calculated using Term::normalizeText().
+        *
+        * @see Term::normalizeText
+        *
+        * @since 0.4
+        *
+        * @param \DatabaseBase $dbw the database connection to use
+        * @param int $rowId the row to update
+        * @param string $text the term's text
+        * @param string $lang the term's language
+        *
+        * @return string the search key
+        */
+       protected function updateSearchKey( \DatabaseBase $dbw, $rowId, $text, 
$lang ) {
+               $key = Term::normalizeText( $text, $lang );
+
+               $dbw->update(
+                       $this->table->getTableName(),
+                       array(
+                               'term_search_key' => $key,
+                       ),
+                       array(
+                               'term_row_id' => $rowId,
+                       ),
+                       __METHOD__
+               );
+
+               return $key;
+       }
+
+       /**
+        * reports a message
+        *
+        * @since 0.4
+        *
+        * @param $msg
+        */
+       protected function report( $msg ) {
+               if ( $this->reporter ) {
+                       $this->reporter->reportMessage( $msg );
+               }
+       }
+
+}
\ No newline at end of file
diff --git a/repo/includes/store/sql/TermSqlCache.php 
b/repo/includes/store/sql/TermSqlCache.php
index 7da45c4..8fec0be 100644
--- a/repo/includes/store/sql/TermSqlCache.php
+++ b/repo/includes/store/sql/TermSqlCache.php
@@ -75,6 +75,18 @@
        }
 
        /**
+        * Returns the name of the database table used to store the terms.
+        * This is the logical table name, subject to prefixing by the Database 
object.
+        *
+        * @since 0.4
+        *
+        * @return string
+        */
+       public function getTableName() {
+               return $this->tableName;
+       }
+
+       /**
         * @see TermCache::saveTermsOfEntity
         *
         * @since 0.1
@@ -326,14 +338,25 @@
        }
 
        /**
-        * Returns the Database from which to read.
+        * Returns the Database connection from which to read.
         *
         * @since 0.1
         *
         * @return \DatabaseBase
         */
-       protected function getReadDb() {
-               return wfGetDB( $this->readDb );
+       public function getReadDb() {
+               return wfGetDB( $this->readDb ); // TODO: allow foreign db
+       }
+
+       /**
+        * Returns the Database connection to wich to write.
+        *
+        * @since 0.4
+        *
+        * @return \DatabaseBase
+        */
+       public function getWriteDb() {
+               return wfGetDB( DB_MASTER ); // TODO: allow foreign db
        }
 
        /**
@@ -790,112 +813,4 @@
 
                return $resultTerms;
        }
-
-       /**
-        * Rebuild the search key field term_search_key from the source 
term_text field.
-        *
-        * FIXME: for some unknown reason some rows are skipped in the rebuild
-        *
-        * @since 0.2
-        */
-       public function rebuildSearchKey() {
-               $dbw = wfGetDB( DB_MASTER );
-
-               $entityId = -1;
-               $entityType = '';
-               $language = '';
-               $type = '';
-               $text = '';
-
-               $limit = 10;
-
-               $hasMoreResults = true;
-
-               while ( $hasMoreResults ) {
-                       $terms = $dbw->select(
-                               $this->tableName,
-                               array(
-                                       'term_entity_id',
-                                       'term_entity_type',
-                                       'term_language',
-                                       'term_type',
-                                       'term_text',
-                               ),
-                               array(
-                                       'term_entity_id >= ' . $dbw->addQuotes( 
$entityId ),
-                                       'term_entity_type >= ' . 
$dbw->addQuotes( $entityType ),
-                                       'term_language >= ' . $dbw->addQuotes( 
$language ),
-                                       'term_type >= ' . $dbw->addQuotes( 
$type ),
-                                       'term_text >= ' . $dbw->addQuotes( 
$text ),
-                               ),
-                               __METHOD__,
-                               array(
-                                       'LIMIT' => $limit,
-                                       'ORDER BY term_entity_id, 
term_entity_type, term_language, term_type, term_text ASC, ASC, ASC, ASC, ASC'
-                               )
-                       );
-
-                       $continuationTerm = $this->rebuildSearchKeyForTerms( 
$terms, $dbw, $limit );
-
-                       if ( $continuationTerm === null ) {
-                               $hasMoreResults = false;
-                       }
-                       else {
-                               $entityId = $continuationTerm->term_entity_id;
-                               $entityType = 
$continuationTerm->term_entity_type;
-                               $language = $continuationTerm->term_language;
-                               $type = $continuationTerm->term_type;
-                               $text = $continuationTerm->term_text;
-                       }
-               }
-       }
-
-       /**
-        * @since 0.2
-        *
-        * @param Iterator $terms
-        * @param DatabaseBase $dbw
-        * @param integer $limit
-        *
-        * @return object|null The continuation term if there is one or null
-        */
-       protected function rebuildSearchKeyForTerms( Iterator $terms, 
DatabaseBase $dbw, $limit ) {
-               $termNumber = 0;
-
-               $doTrx = $dbw->trxLevel() === 0;
-
-               if ( $doTrx ) {
-                       $dbw->begin();
-               }
-
-               $hasMoreResults = false;
-
-               foreach ( $terms as $term ) {
-                       $dbw->update(
-                               $this->tableName,
-                               array(
-                                       'term_search_key' => 
Term::normalizeText( $term->term_text, $term->term_language )
-                               ),
-                               array(
-                                       'term_entity_id' => 
$term->term_entity_id,
-                                       'term_entity_type' => 
$term->term_entity_type,
-                                       'term_language' => $term->term_language,
-                                       'term_type' => $term->term_type,
-                                       'term_text' => $term->term_text,
-                               ),
-                               __METHOD__
-                       );
-
-                       if ( ++$termNumber >= $limit ) {
-                               $hasMoreResults = true;
-                       }
-               }
-
-               if ( $doTrx ) {
-                       $dbw->commit();
-               }
-
-               return $hasMoreResults ? $term : null;
-       }
-
 }
diff --git a/repo/maintenance/rebuildTermsSearchKey.php 
b/repo/maintenance/rebuildTermsSearchKey.php
index 48e06cf..30636fe 100644
--- a/repo/maintenance/rebuildTermsSearchKey.php
+++ b/repo/maintenance/rebuildTermsSearchKey.php
@@ -36,9 +36,13 @@
 class RebuildTermsSearchKey extends LoggedUpdateMaintenance {
 
        public function __construct() {
+               parent::__construct();
+
                $this->mDescription = 'Rebuild the search key of the 
TermSQLCache';
 
-               parent::__construct();
+               $this->addOption( 'only-missing', "Update only missing keys 
(per default, all keys are updated)" );
+               $this->addOption( 'start-row', "The ID of the first row to 
update (useful for continuing aborted runs)", false, true );
+               $this->addOption( 'batch-size', "Number of rows to update per 
database transaction (100 per default)", false, true );
        }
 
        /**
@@ -52,7 +56,23 @@
                        exit;
                }
 
-               StoreFactory::getStore( 'sqlstore' 
)->newTermCache()->rebuildSearchKey();
+               $reporter = new \ObservableMessageReporter();
+               $reporter->registerReporterCallback(
+                       array( $this, 'report' )
+               );
+
+               $table = StoreFactory::getStore( 'sqlstore' )->newTermCache();
+               $builder = new TermSearchKeyBuilder( $table );
+               $builder->setReporter( $reporter );
+
+               $builder->setBatchSize( intval( $this->getOption( 'batch-size', 
100 ) ) );
+               $builder->setRebuildAll( !$this->getOption( 'only-missing', 
false ) );
+               $builder->setFromId( intval( $this->getOption( 'start-row', 1 ) 
) );
+
+               $n = $builder->rebuildSearchKey();
+
+               $this->output( "Done. Updated $n search keys.\n" );
+
                return true;
        }
 
@@ -65,6 +85,17 @@
                return 'Wikibase\RebuildTermsSearchKey';
        }
 
+       /**
+        * Outputs a message vis the output() method.
+        *
+        * @since 0.4
+        *
+        * @param $msg
+        */
+       public function report( $msg ) {
+               $this->output( "$msg\n" );
+       }
+
 }
 
 $maintClass = 'Wikibase\RebuildTermsSearchKey';
diff --git a/repo/tests/phpunit/includes/store/sql/TermSearchKeyBuilderTest.php 
b/repo/tests/phpunit/includes/store/sql/TermSearchKeyBuilderTest.php
new file mode 100644
index 0000000..53fb08a
--- /dev/null
+++ b/repo/tests/phpunit/includes/store/sql/TermSearchKeyBuilderTest.php
@@ -0,0 +1,110 @@
+<?php
+
+namespace Wikibase\Test;
+use Wikibase\TermSqlCache;
+use Wikibase\Term;
+
+/**
+ * Tests for the Wikibase\TermSqlCache class.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @since 0.2
+ *
+ * @ingroup WikibaseRepoTest
+ * @ingroup Test
+ *
+ * @group Wikibase
+ * @group WikibaseStore
+ * @group WikibaseTerm
+ * @group Database
+ *
+ * Some of the tests takes more time, and needs therefor longer time before 
they can be aborted
+ * as non-functional. The reason why tests are aborted is assumed to be set up 
of temporal databases
+ * that hold the first tests in a pending state awaiting access to the 
database.
+ * @group medium
+ *
+ * @licence GNU GPL v2+
+ * @author Jeroen De Dauw < [email protected] >
+ */
+class TermSearchKeyBuilderTest extends \MediaWikiTestCase {
+
+       public function termProvider() {
+               $argLists = array();
+
+               $argLists[] = array( 'en', 'FoO', 'fOo', true );
+               $argLists[] = array( 'ru', 'Берлин', '  берлин  ', true );
+
+               $argLists[] = array( 'en', 'FoO', 'bar', false );
+               $argLists[] = array( 'ru', 'Берлин', 'бе55585рлин', false );
+
+               return $argLists;
+       }
+
+       /**
+        * @dataProvider termProvider
+        * @param $languageCode
+        * @param $termText
+        * @param $searchText
+        * @param boolean $matches
+        */
+       public function testRebuildSearchKey( $languageCode, $termText, 
$searchText, $matches ) {
+               if ( \Wikibase\Settings::get( 'withoutTermSearchKey' ) ) {
+                       $this->markTestSkipped( "can't test search key if 
withoutTermSearchKey option is set." );
+               }
+
+               // make term in item
+               $item = \Wikibase\Item::newEmpty();
+               $item->setId( 42 );
+               $item->setLabel( $languageCode, $termText );
+
+               // save term
+               /* @var TermSqlCache $termCache */
+               $termCache = \Wikibase\StoreFactory::getStore( 'sqlstore' 
)->newTermCache();
+               $termCache->clear();
+               $termCache->saveTermsOfEntity( $item );
+
+               // remove search key
+               $dbw = wfGetDB( DB_MASTER );
+               $dbw->update( $termCache->getTableName(), array( 
'term_search_key' => '' ), array(), __METHOD__ );
+
+               // rebuild search key
+               $builder = new \Wikibase\TermSearchKeyBuilder( $termCache );
+               $builder->setRebuildAll( true );
+               $builder->rebuildSearchKey();
+
+               // remove search key
+               $term = new Term();
+               $term->setLanguage( $languageCode );
+               $term->setText( $searchText );
+
+               $options = array(
+                       'caseSensitive' => false,
+               );
+
+               $obtainedTerms = $termCache->getMatchingTerms( array( $term ), 
Term::TYPE_LABEL, \Wikibase\Item::ENTITY_TYPE, $options );
+
+               $this->assertEquals( $matches ? 1 : 0, count( $obtainedTerms ) 
);
+
+               if ( $matches ) {
+                       $obtainedTerm = array_shift( $obtainedTerms );
+
+                       $this->assertEquals( $termText, 
$obtainedTerm->getText() );
+               }
+       }
+
+}
diff --git a/repo/tests/phpunit/includes/store/sql/TermSqlCacheTest.php 
b/repo/tests/phpunit/includes/store/sql/TermSqlCacheTest.php
index 5953a80..256a6a4 100644
--- a/repo/tests/phpunit/includes/store/sql/TermSqlCacheTest.php
+++ b/repo/tests/phpunit/includes/store/sql/TermSqlCacheTest.php
@@ -30,8 +30,8 @@
  *
  * @group Wikibase
  * @group WikibaseStore
+ * @group WikibaseTerm
  * @group Database
- * @group TermSqlCacheTest
  *
  * Some of the tests takes more time, and needs therefor longer time before 
they can be aborted
  * as non-functional. The reason why tests are aborted is assumed to be set up 
of temporal databases
@@ -47,9 +47,7 @@
                $argLists = array();
 
                $argLists[] = array( 'en', 'FoO', 'fOo', true );
-
-               // TODO: enable once we fix our normalization and lowercasing 
to work with cyrillic
-               //$argLists[] = array( 'ru', 'Берлин', 'берлин', true );
+               $argLists[] = array( 'ru', 'Берлин', 'берлин', true );
 
                $argLists[] = array( 'en', 'FoO', 'bar', false );
                $argLists[] = array( 'ru', 'Берлин', 'бе55585рлин', false );
@@ -64,7 +62,7 @@
         * @param $searchText
         * @param boolean $matches
         */
-       public function testRebuildSearchKey( $languageCode, $termText, 
$searchText, $matches ) {
+       public function testGetMatchingTerms( $languageCode, $termText, 
$searchText, $matches ) {
                if ( \Wikibase\Settings::get( 'withoutTermSearchKey' ) ) {
                        $this->markTestSkipped( "can't test search key if 
withoutTermSearchKey option is set." );
                }
@@ -82,8 +80,6 @@
                $item->setLabel( $languageCode, $termText );
 
                $termCache->saveTermsOfEntity( $item );
-
-               $termCache->rebuildSearchKey();
 
                $term = new Term();
                $term->setLanguage( $languageCode );

-- 
To view, visit https://gerrit.wikimedia.org/r/51847
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I4d2b9fcaa9d848a96540e51fdc385355e0e466d0
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Aude <[email protected]>
Gerrit-Reviewer: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Jens Ohlig <[email protected]>
Gerrit-Reviewer: Jeroen De Dauw <[email protected]>
Gerrit-Reviewer: John Erling Blad <[email protected]>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to