Denny Vrandecic has uploaded a new change for review. https://gerrit.wikimedia.org/r/73405
Change subject: (bug 43238) Add very simple weighting for entity search (DO NOT MERGE) ...................................................................... (bug 43238) Add very simple weighting for entity search (DO NOT MERGE) A very simple weighting algorithm (number of sitelinks) has been added to do a post-DB-query ranking of search results. Also, this limits the search to 5000 items. It requires Bug 51227 to be resolved first. Change-Id: Ie09e4932de42676dd5d638d32f76abe0a2200617 --- M lib/includes/store/sql/TermSqlIndex.php 1 file changed, 33 insertions(+), 7 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/05/73405/1 diff --git a/lib/includes/store/sql/TermSqlIndex.php b/lib/includes/store/sql/TermSqlIndex.php index f9c10d8..e324a14 100644 --- a/lib/includes/store/sql/TermSqlIndex.php +++ b/lib/includes/store/sql/TermSqlIndex.php @@ -150,6 +150,14 @@ 'term_entity_type' => $entity->getType() ); + // very simple weighting calculation. + // TODO delegate this to an object of its own + $entityWeight = array(); + if ( $entity instanceof Item ) { + $weight = count( $entity->getSimpleSiteLinks() ) / 1000.0; + $entityWeight['term_weight'] = $weight; + } + wfDebugLog( __CLASS__, __FUNCTION__ . ": updating terms for " . $entity->getId()->getPrefixedId() ); $success = $dbw->delete( @@ -166,7 +174,8 @@ $this->tableName, array_merge( $this->getTermFields( $term ), - $entityIdentifiers + $entityIdentifiers, + $entityWeight ), __METHOD__ ); @@ -539,7 +548,7 @@ * * @param array $terms * @param string $entityType - * @param array $options + * @param array $options There is an implicit LIMIT of 5000 items in this implementation * * @return EntityId[] */ @@ -548,16 +557,19 @@ return array(); } + // this is the maximum limit of search results TODO this should not be hardcoded + $internalLimit = 5000; + $conditions = $this->termsToConditions( $terms, null, $entityType, false, $options ); - $selectionFields = array( 'term_entity_id' ); + $selectionFields = array( 'term_entity_id', 'term_weight' ); $dbr = $this->getReadDb(); $queryOptions = array( 'DISTINCT' ); if ( array_key_exists( 'LIMIT', $options ) && $options['LIMIT'] ) { - $queryOptions['LIMIT'] = $options['LIMIT']; + $queryOptions['LIMIT'] = max( $options['LIMIT'], $internalLimit ); } $obtainedIDs = $dbr->select( @@ -568,12 +580,26 @@ $queryOptions ); - $result = array(); + $entityIds = array(); + $weights = array(); foreach ( $obtainedIDs as $obtainedID ) { - $result[] = new EntityId( $entityType, (int)$obtainedID->term_entity_id ); + $entityIds[] = new EntityId( $entityType, (int)$obtainedID->term_entity_id ); + $weights[] = floatval( $obtainedID->term_weight ); + } + $this->releaseConnection( $dbr ); + + // this is a post-search sorting by weight. This allows us to not require an additional + // index on the wb_terms table that is very big already. This is also why we have + // the internal limit of 5000, since SQL's index would explode in size if we added the + // weight to it here (which would allow us to delegate the sorting to SQL itself) + array_multisort( $weights, SORT_DESC, SORT_NUMERIC, $entityIds ); + + if ( array_key_exists( 'LIMIT', $options ) && $options['LIMIT'] ) { + $result = array_slice( $entityIds, 0, $options['LIMIT'] ); + } else { + $result = $entityIds; } - $this->releaseConnection( $dbr ); return $result; } -- To view, visit https://gerrit.wikimedia.org/r/73405 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ie09e4932de42676dd5d638d32f76abe0a2200617 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Denny Vrandecic <denny.vrande...@wikimedia.de> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits