Denny Vrandecic has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/73405


Change subject: (bug 43238) Add very simple weighting for entity search (DO NOT MERGE)
......................................................................

(bug 43238) Add very simple weighting for entity search (DO NOT MERGE)

A very simple weighting algorithm (number of sitelinks) has
been added to do a post-DB-query ranking of search results.
Also, this limits the search to 5000 items.
It requires Bug 51227 to be resolved first.

Change-Id: Ie09e4932de42676dd5d638d32f76abe0a2200617
---
M lib/includes/store/sql/TermSqlIndex.php
1 file changed, 33 insertions(+), 7 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/05/73405/1

diff --git a/lib/includes/store/sql/TermSqlIndex.php 
b/lib/includes/store/sql/TermSqlIndex.php
index f9c10d8..e324a14 100644
--- a/lib/includes/store/sql/TermSqlIndex.php
+++ b/lib/includes/store/sql/TermSqlIndex.php
@@ -150,6 +150,14 @@
                        'term_entity_type' => $entity->getType()
                );
 
+               // very simple weighting calculation.
+               // TODO delegate this to an object of its own
+               $entityWeight = array();
+               if ( $entity instanceof Item ) {
+                       $weight = count( $entity->getSimpleSiteLinks() ) / 
1000.0;
+                       $entityWeight['term_weight'] = $weight;
+               }
+
                wfDebugLog( __CLASS__, __FUNCTION__ . ": updating terms for " . 
$entity->getId()->getPrefixedId() );
 
                $success = $dbw->delete(
@@ -166,7 +174,8 @@
                                $this->tableName,
                                array_merge(
                                        $this->getTermFields( $term ),
-                                       $entityIdentifiers
+                                       $entityIdentifiers,
+                                       $entityWeight
                                ),
                                __METHOD__
                        );
@@ -539,7 +548,7 @@
         *
         * @param array $terms
         * @param string $entityType
-        * @param array $options
+        * @param array $options There is an implicit LIMIT of 5000 items in 
this implementation
         *
         * @return EntityId[]
         */
@@ -548,16 +557,19 @@
                        return array();
                }
 
+               // this is the maximum limit of search results TODO this should 
not be hardcoded
+               $internalLimit = 5000;
+
                $conditions = $this->termsToConditions( $terms, null, 
$entityType, false, $options );
 
-               $selectionFields = array( 'term_entity_id' );
+               $selectionFields = array( 'term_entity_id', 'term_weight' );
 
                $dbr = $this->getReadDb();
 
                $queryOptions = array( 'DISTINCT' );
 
                if ( array_key_exists( 'LIMIT', $options ) && $options['LIMIT'] 
) {
-                       $queryOptions['LIMIT'] = $options['LIMIT'];
+                       $queryOptions['LIMIT'] = max( $options['LIMIT'], 
$internalLimit );
                }
 
                $obtainedIDs = $dbr->select(
@@ -568,12 +580,26 @@
                        $queryOptions
                );
 
-               $result = array();
+               $entityIds = array();
+               $weights = array();
                foreach ( $obtainedIDs as $obtainedID ) {
-                       $result[] = new EntityId( $entityType, 
(int)$obtainedID->term_entity_id );
+                       $entityIds[] = new EntityId( $entityType, 
(int)$obtainedID->term_entity_id );
+                       $weights[] = floatval( $obtainedID->term_weight );
+               }
+               $this->releaseConnection( $dbr );
+
+               // this is a post-search sorting by weight. This allows us to 
not require an additional
+               // index on the wb_terms table that is very big already. This 
is also why we have
+               // the internal limit of 5000, since SQL's index would explode 
in size if we added the
+               // weight to it here (which would allow us to delegate the 
sorting to SQL itself)
+               array_multisort( $weights, SORT_DESC, SORT_NUMERIC, $entityIds 
);
+
+               if ( array_key_exists( 'LIMIT', $options ) && $options['LIMIT'] 
) {
+                       $result = array_slice( $entityIds, 0, $options['LIMIT'] 
);
+               } else {
+                       $result = $entityIds;
                }
 
-               $this->releaseConnection( $dbr );
                return $result;
        }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/73405
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie09e4932de42676dd5d638d32f76abe0a2200617
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Denny Vrandecic <denny.vrande...@wikimedia.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to