jenkins-bot has submitted this change and it was merged.

Change subject: (bug 43238) Add very simple weighting for entity search
......................................................................


(bug 43238) Add very simple weighting for entity search

A very simple weighting algorithm (number of sitelinks) has
been added to do a post-DB-query ranking of search results.
Also, this limits the search to 5000 items.

DEPLOYMENT NOTE: When deploying without a database update, 
set 'withoutTermWeight' => true.

Change-Id: Ie09e4932de42676dd5d638d32f76abe0a2200617
---
M docs/options.wiki
M lib/includes/store/sql/TermSqlIndex.php
M repo/config/Wikibase.default.php
M repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
4 files changed, 159 insertions(+), 7 deletions(-)

Approvals:
  Daniel Kinzler: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/docs/options.wiki b/docs/options.wiki
index 1c6645c..af18355 100644
--- a/docs/options.wiki
+++ b/docs/options.wiki
@@ -50,6 +50,7 @@
 ;defaultStore: The storage engine to use for storing entities. Default: 
'sqlstore'. There is currently no alternative.
 ;idBlacklist: A list of IDs to reserve and skip for new entities. IDs are 
given as integers, the blacklist applies to all types of entities. '''Note:''' 
this may change in the future to allow separate blacklists for different kinds 
of entities.
 ;withoutTermSearchKey: Allow the terms table to work without the 
term_search_key field, for sites that can not easily roll out schema changes on 
large tables. This means that all searches will use exact matching (depending 
on the database's collation). Default: <code>false</code>. This is only needed 
for compatibility with old database layouts.
+;withoutTermWeight: Allow the terms table to work without the term_weight 
field, for sites that can not easily roll out schema changes on large tables. 
This means that all searches will return the results in an undefined order 
(depending on how the database works). Default: <code>false</code>. This is 
only needed for compatibility with old database layouts.
 ;multilang-limits: Limits to impose on multilanguage strings like labels, 
descriptions and such. Supported limits:
 :;length: the maximum length of the string, in characters.
 :Default: <code>array( 'length' => 250 )</code>
diff --git a/lib/includes/store/sql/TermSqlIndex.php 
b/lib/includes/store/sql/TermSqlIndex.php
index f9c10d8..f77488f 100644
--- a/lib/includes/store/sql/TermSqlIndex.php
+++ b/lib/includes/store/sql/TermSqlIndex.php
@@ -158,15 +158,23 @@
                        __METHOD__
                );
 
+               $weightField = array();
+               if ( $this->supportsWeight() ) {
+                       $weightField = array( 'term_weight'  => 
$this->getWeight( $entity ) );
+               }
+
+
                /**
                 * @var Term $term
                 */
+
                foreach ( $entity->getTerms() as $term ) {
                        $success = $dbw->insert(
                                $this->tableName,
                                array_merge(
                                        $this->getTermFields( $term ),
-                                       $entityIdentifiers
+                                       $entityIdentifiers,
+                                       $weightField
                                ),
                                __METHOD__
                        );
@@ -177,6 +185,27 @@
                }
 
                return $success;
+       }
+
+       /**
+        * Calculate a weight for the given entity to be used for ranking. Should 
be normalized
+        * between 0 and 1, but that's not a strong constraint.
+        * This implementation relies on sitelinks, and simply takes the number 
of sitelinks
+        * divided by 1000 as the weight.
+        *
+        * TODO Should be moved to its own object and be added via dependency 
injection
+        *
+        * @since 0.4
+        *
+        * @param Entity $entity
+        *
+        * @return float weight
+        */
+       protected function getWeight( Entity $entity ) {
+               if ( $entity instanceof Item ) {
+                       return count( $entity->getSimpleSiteLinks() ) / 1000.0;
+               }
+               return 0.0;
        }
 
        /**
@@ -539,7 +568,7 @@
         *
         * @param array $terms
         * @param string $entityType
-        * @param array $options
+        * @param array $options There is an implicit LIMIT of 5000 items in 
this implementation
         *
         * @return EntityId[]
         */
@@ -548,16 +577,32 @@
                        return array();
                }
 
+               // this is the maximum limit of search results TODO this should 
not be hardcoded
+               $internalLimit = 5000;
+
                $conditions = $this->termsToConditions( $terms, null, 
$entityType, false, $options );
+
+               $dbr = $this->getReadDb();
 
                $selectionFields = array( 'term_entity_id' );
 
-               $dbr = $this->getReadDb();
+               // TODO instead of a DB query, get a setting. Should save on a 
few Database round trips.
+               $hasWeight = $this->supportsWeight();
+
+               if ( $hasWeight ) {
+                       $selectionFields[] = 'term_weight';
+               }
 
                $queryOptions = array( 'DISTINCT' );
 
                if ( array_key_exists( 'LIMIT', $options ) && $options['LIMIT'] 
) {
-                       $queryOptions['LIMIT'] = $options['LIMIT'];
+                       if ( $hasWeight ) {
+                               // if we take the weight into account, we need 
to grab basically all hits in order
+                               // to allow for the post-search sorting below.
+                               $queryOptions['LIMIT'] = max( 
$options['LIMIT'], $internalLimit );
+                       } else {
+                               $queryOptions['LIMIT'] = $options['LIMIT'];
+                       }
                }
 
                $obtainedIDs = $dbr->select(
@@ -568,12 +613,38 @@
                        $queryOptions
                );
 
-               $result = array();
-               foreach ( $obtainedIDs as $obtainedID ) {
-                       $result[] = new EntityId( $entityType, 
(int)$obtainedID->term_entity_id );
+               if ( $hasWeight ) {
+                       $weights = array();
+                       foreach ( $obtainedIDs as $obtainedID ) {
+                               $weights[intval( $obtainedID->term_entity_id )] 
= floatval( $obtainedID->term_weight );
+                       }
+
+                       // this is a post-search sorting by weight. This allows 
us to not require an additional
+                       // index on the wb_terms table that is very big 
already. This is also why we have
+                       // the internal limit of 5000, since SQL's index would 
explode in size if we added the
+                       // weight to it here (which would allow us to delegate 
the sorting to SQL itself)
+                       arsort( $weights, SORT_NUMERIC );
+
+                       if ( array_key_exists( 'LIMIT', $options ) && 
$options['LIMIT'] ) {
+                               $ids = array_keys( array_slice( $weights, 0, 
$options['LIMIT'], true ) );
+                       } else {
+                               $ids = array_keys( $weights );
+                       }
+               } else {
+                       $ids = array();
+                       foreach ( $obtainedIDs as $obtainedID ) {
+                               $ids[] = intval( $obtainedID->term_entity_id );
+                       }
                }
 
                $this->releaseConnection( $dbr );
+
+               // turn numbers into entity ids
+               $result = array();
+               foreach ( $ids as $id ) {
+                       $result[] = new EntityId( $entityType, $id );
+               }
+
                return $result;
        }
 
@@ -894,4 +965,12 @@
 
                return $normalized;
        }
+
+       /**
+        * Whether term weights are in use, i.e. the 'withoutTermWeight'
+        * setting is not enabled.
+        *
+        * @return bool
+        */
+       public function supportsWeight() {
+               return !\Wikibase\Settings::get( 'withoutTermWeight' );
+       }
 }
diff --git a/repo/config/Wikibase.default.php b/repo/config/Wikibase.default.php
index f905118..8a0e331 100644
--- a/repo/config/Wikibase.default.php
+++ b/repo/config/Wikibase.default.php
@@ -62,6 +62,12 @@
                // (depending on the database's collation).
                'withoutTermSearchKey' => false,
 
+               // Allow the TermIndex table to work without weights,
+               // for sites that can not easily roll out schema changes on 
large tables.
+               // This means that all searches will return results in an undefined order
+               // (depending on the database's inner workings).
+               'withoutTermWeight' => false,
+
                'entityNamespaces' => array(),
 
                // These are used for multilanguage strings that should have a 
soft length constraint
diff --git a/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php 
b/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
index 7aebbac..bd5288b 100644
--- a/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
+++ b/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
@@ -1,6 +1,7 @@
 <?php
 
 namespace Wikibase\Test;
+use Wikibase\DataModel\SimpleSiteLink;
 use Wikibase\Item;
 use Wikibase\StringNormalizer;
 use Wikibase\Term;
@@ -101,6 +102,71 @@
                }
        }
 
+       /**
+        * @dataProvider termProvider
+        * @param $languageCode
+        * @param $termText
+        * @param $searchText
+        * @param boolean $matches
+        */
+       public function testGetMatchingTermsWeights( $languageCode, $termText, 
$searchText, $matches ) {
+               /**
+                * @var \Wikibase\TermSqlIndex $termIndex
+                */
+               $termIndex = $this->getTermIndex();
+
+               if ( !$termIndex->supportsWeight() ) {
+                       $this->markTestSkipped( "can't test search weight if 
withoutTermWeight option is set." );
+               }
+
+               $termIndex->clear();
+
+               $item1 = \Wikibase\Item::newEmpty();
+               $item1->setId( 42 );
+
+               $item1->setLabel( $languageCode, $termText );
+               $item1->addSimpleSiteLink( new SimpleSiteLink( 'enwiki', 'A' ) 
);
+
+               $termIndex->saveTermsOfEntity( $item1 );
+
+               $item2 = \Wikibase\Item::newEmpty();
+               $item2->setId( 23 );
+
+               $item2->setLabel( $languageCode, $termText );
+               $item2->addSimpleSiteLink( new SimpleSiteLink( 'enwiki', 'B' ) 
);
+               $item2->addSimpleSiteLink( new SimpleSiteLink( 'dewiki', 'B' ) 
);
+               $item2->addSimpleSiteLink( new SimpleSiteLink( 'hrwiki', 'B' ) 
);
+               $item2->addSimpleSiteLink( new SimpleSiteLink( 'uzwiki', 'B' ) 
);
+
+               $termIndex->saveTermsOfEntity( $item2 );
+
+               $item3 = \Wikibase\Item::newEmpty();
+               $item3->setId( 108 );
+
+               $item3->setLabel( $languageCode, $termText );
+               $item3->addSimpleSiteLink( new SimpleSiteLink( 'hrwiki', 'C' ) 
);
+               $item3->addSimpleSiteLink( new SimpleSiteLink( 'uzwiki', 'C' ) 
);
+
+               $termIndex->saveTermsOfEntity( $item3 );
+
+               $term = new Term();
+               $term->setLanguage( $languageCode );
+               $term->setText( $searchText );
+
+               $options = array(
+                       'caseSensitive' => false,
+               );
+
+               $obtainedIDs = $termIndex->getMatchingIDs( array( $term ), 
\Wikibase\Item::ENTITY_TYPE, $options );
+
+               $this->assertEquals( $matches ? 3 : 0, count( $obtainedIDs ) );
+
+               if ( $matches ) {
+                       $expectedResult = array( $item2->getId(), 
$item3->getId(), $item1->getId() );
+                       $this->assertArrayEquals( $expectedResult, 
$obtainedIDs, true );
+               }
+       }
+
        public static function provideGetSearchKey() {
                return array(
                        array( // #0

-- 
To view, visit https://gerrit.wikimedia.org/r/73405
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie09e4932de42676dd5d638d32f76abe0a2200617
Gerrit-PatchSet: 17
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Denny Vrandecic <denny.vrande...@wikimedia.de>
Gerrit-Reviewer: Aude <aude.w...@gmail.com>
Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de>
Gerrit-Reviewer: Denny Vrandecic <denny.vrande...@wikimedia.de>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to