jenkins-bot has submitted this change and it was merged. Change subject: (bug 43238) Add very simple weighting for entity search ......................................................................
(bug 43238) Add very simple weighting for entity search A very simple weighting algorithm (number of sitelinks) has been added to do a post-DB-query ranking of search results. Also, this limits the search to 5000 items. DEPLOYMENT NOTE: When deploying without a database update, set 'withoutTermWeight' => true. Change-Id: Ie09e4932de42676dd5d638d32f76abe0a2200617 --- M docs/options.wiki M lib/includes/store/sql/TermSqlIndex.php M repo/config/Wikibase.default.php M repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php 4 files changed, 159 insertions(+), 7 deletions(-) Approvals: Daniel Kinzler: Looks good to me, approved jenkins-bot: Verified diff --git a/docs/options.wiki b/docs/options.wiki index 1c6645c..af18355 100644 --- a/docs/options.wiki +++ b/docs/options.wiki @@ -50,6 +50,7 @@ ;defaultStore: The storage engine to use for storing entities. Default: 'sqlstore'. There is currently no alternative. ;idBlacklist: A list of IDs to reserve and skip for new entities. IDs are given as integers, the blacklist applies to all types of entities. '''Note:''' this may change in the future to allow separate blacklists for different kinds of entities. ;withoutTermSearchKey: Allow the terms table to work without the term_search_key field, for sites that can not easily roll out schema changes on large tables. This means that all searches will use exact matching (depending on the database's collation). Default: <code>false</code>. This is only needed for compatibility with old database layouts. +;withoutTermWeight: Allow the terms table to work without the term_weight field, for sites that can not easily roll out schema changes on large tables. This means that all searches will return the results in an undefined order (depending on how the database works). Default: <code>false</code>. This is only needed for compatibility with old database layouts. ;multilang-limits: Limits to impose on multilanguage strings like labels, descriptions and such. 
Supported limits: :;length: the maximum length of the string, in characters. :Default: <code>array( 'length' => 250 )</code> diff --git a/lib/includes/store/sql/TermSqlIndex.php b/lib/includes/store/sql/TermSqlIndex.php index f9c10d8..f77488f 100644 --- a/lib/includes/store/sql/TermSqlIndex.php +++ b/lib/includes/store/sql/TermSqlIndex.php @@ -158,15 +158,23 @@ __METHOD__ ); + $weightField = array(); + if ( $this->supportsWeight() ) { + $weightField = array( 'term_weight' => $this->getWeight( $entity ) ); + } + + /** * @var Term $term */ + foreach ( $entity->getTerms() as $term ) { $success = $dbw->insert( $this->tableName, array_merge( $this->getTermFields( $term ), - $entityIdentifiers + $entityIdentifiers, + $weightField ), __METHOD__ ); @@ -177,6 +185,27 @@ } return $success; + } + + /** + * Calculate a weight the given entity to be used for ranking. Should be normalized + * between 0 and 1, but that's not a strong constraint. + * This implementation relies on sitelinks, and simply takes the number of sitelinks + * as the weight. + * + * TODO Should be moved to its own object and be added via dependency injection + * + * @since 0.4 + * + * @param Entity $entity + * + * @return float weight + */ + protected function getWeight( Entity $entity ) { + if ( $entity instanceof Item ) { + return count( $entity->getSimpleSiteLinks() ) / 1000.0; + } + return 0.0; } /** @@ -539,7 +568,7 @@ * * @param array $terms * @param string $entityType - * @param array $options + * @param array $options There is an implicit LIMIT of 5000 items in this implementation * * @return EntityId[] */ @@ -548,16 +577,32 @@ return array(); } + // this is the maximum limit of search results TODO this should not be hardcoded + $internalLimit = 5000; + $conditions = $this->termsToConditions( $terms, null, $entityType, false, $options ); + + $dbr = $this->getReadDb(); $selectionFields = array( 'term_entity_id' ); - $dbr = $this->getReadDb(); + // TODO instead of a DB query, get a setting. 
Should save on a few Database round trips. + $hasWeight = $this->supportsWeight(); + + if ( $hasWeight ) { + $selectionFields[] = 'term_weight'; + } $queryOptions = array( 'DISTINCT' ); if ( array_key_exists( 'LIMIT', $options ) && $options['LIMIT'] ) { - $queryOptions['LIMIT'] = $options['LIMIT']; + if ( $hasWeight ) { + // if we take the weight into account, we need to grab basically all hits in order + // to allow for the post-search sorting below. + $queryOptions['LIMIT'] = max( $options['LIMIT'], $internalLimit ); + } else { + $queryOptions['LIMIT'] = $options['LIMIT']; + } } $obtainedIDs = $dbr->select( @@ -568,12 +613,38 @@ $queryOptions ); - $result = array(); - foreach ( $obtainedIDs as $obtainedID ) { - $result[] = new EntityId( $entityType, (int)$obtainedID->term_entity_id ); + if ( $hasWeight ) { + $weights = array(); + foreach ( $obtainedIDs as $obtainedID ) { + $weights[intval( $obtainedID->term_entity_id )] = floatval( $obtainedID->term_weight ); + } + + // this is a post-search sorting by weight. This allows us to not require an additional + // index on the wb_terms table that is very big already. 
This is also why we have + // the internal limit of 5000, since SQL's index would explode in size if we added the + // weight to it here (which would allow us to delegate the sorting to SQL itself) + arsort( $weights, SORT_NUMERIC ); + + if ( array_key_exists( 'LIMIT', $options ) && $options['LIMIT'] ) { + $ids = array_keys( array_slice( $weights, 0, $options['LIMIT'], true ) ); + } else { + $ids = array_keys( $weights ); + } + } else { + $ids = array(); + foreach ( $obtainedIDs as $obtainedID ) { + $ids[] = intval( $obtainedID->term_entity_id ); + } } $this->releaseConnection( $dbr ); + + // turn numbers into entity ids + $result = array(); + foreach ( $ids as $id ) { + $result[] = new EntityId( $entityType, $id ); + } + return $result; } @@ -894,4 +965,12 @@ return $normalized; } + + /** + * @param $dbr + * @return mixed + */ + public function supportsWeight() { + return !\Wikibase\Settings::get( 'withoutTermWeight' ); + } } diff --git a/repo/config/Wikibase.default.php b/repo/config/Wikibase.default.php index f905118..8a0e331 100644 --- a/repo/config/Wikibase.default.php +++ b/repo/config/Wikibase.default.php @@ -62,6 +62,12 @@ // (depending on the database's collation). 'withoutTermSearchKey' => false, + // Allow the TermIndex table to work without weights, + // for sites that can not easily roll out schema changes on large tables. + // This means that all searches will return an undefined order + // (depending on the database's inner working). 
+ 'withoutTermWeight' => false, + 'entityNamespaces' => array(), // These are used for multilanguage strings that should have a soft length constraint diff --git a/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php b/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php index 7aebbac..bd5288b 100644 --- a/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php +++ b/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php @@ -1,6 +1,7 @@ <?php namespace Wikibase\Test; +use Wikibase\DataModel\SimpleSiteLink; use Wikibase\Item; use Wikibase\StringNormalizer; use Wikibase\Term; @@ -101,6 +102,71 @@ } } + /** + * @dataProvider termProvider + * @param $languageCode + * @param $termText + * @param $searchText + * @param boolean $matches + */ + public function testGetMatchingTermsWeights( $languageCode, $termText, $searchText, $matches ) { + /** + * @var \Wikibase\TermSqlIndex $termIndex + */ + $termIndex = $this->getTermIndex(); + + if ( !$termIndex->supportsWeight() ) { + $this->markTestSkipped( "can't test search weight if withoutTermWeight option is set." 
); + } + + $termIndex->clear(); + + $item1 = \Wikibase\Item::newEmpty(); + $item1->setId( 42 ); + + $item1->setLabel( $languageCode, $termText ); + $item1->addSimpleSiteLink( new SimpleSiteLink( 'enwiki', 'A' ) ); + + $termIndex->saveTermsOfEntity( $item1 ); + + $item2 = \Wikibase\Item::newEmpty(); + $item2->setId( 23 ); + + $item2->setLabel( $languageCode, $termText ); + $item2->addSimpleSiteLink( new SimpleSiteLink( 'enwiki', 'B' ) ); + $item2->addSimpleSiteLink( new SimpleSiteLink( 'dewiki', 'B' ) ); + $item2->addSimpleSiteLink( new SimpleSiteLink( 'hrwiki', 'B' ) ); + $item2->addSimpleSiteLink( new SimpleSiteLink( 'uzwiki', 'B' ) ); + + $termIndex->saveTermsOfEntity( $item2 ); + + $item3 = \Wikibase\Item::newEmpty(); + $item3->setId( 108 ); + + $item3->setLabel( $languageCode, $termText ); + $item3->addSimpleSiteLink( new SimpleSiteLink( 'hrwiki', 'C' ) ); + $item3->addSimpleSiteLink( new SimpleSiteLink( 'uzwiki', 'C' ) ); + + $termIndex->saveTermsOfEntity( $item3 ); + + $term = new Term(); + $term->setLanguage( $languageCode ); + $term->setText( $searchText ); + + $options = array( + 'caseSensitive' => false, + ); + + $obtainedIDs = $termIndex->getMatchingIDs( array( $term ), \Wikibase\Item::ENTITY_TYPE, $options ); + + $this->assertEquals( $matches ? 
3 : 0, count( $obtainedIDs ) ); + + if ( $matches ) { + $expectedResult = array( $item2->getId(), $item3->getId(), $item1->getId() ); + $this->assertArrayEquals( $expectedResult, $obtainedIDs, true ); + } + } + public static function provideGetSearchKey() { return array( array( // #0 -- To view, visit https://gerrit.wikimedia.org/r/73405 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ie09e4932de42676dd5d638d32f76abe0a2200617 Gerrit-PatchSet: 17 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Denny Vrandecic <denny.vrande...@wikimedia.de> Gerrit-Reviewer: Aude <aude.w...@gmail.com> Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de> Gerrit-Reviewer: Denny Vrandecic <denny.vrande...@wikimedia.de> Gerrit-Reviewer: jenkins-bot _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits