Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/81020


Change subject: Update the search index for linked pages.
......................................................................

Update the search index for linked pages.

Also don't blow up if someone redirects to a page that doesn't exist.

Change-Id: Icbf35f5443796085b688996b07703234c9116a0b
---
M CirrusSearch.body.php
M CirrusSearch.php
2 files changed, 60 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/20/81020/1

diff --git a/CirrusSearch.body.php b/CirrusSearch.body.php
index 4c418e1..110153c 100644
--- a/CirrusSearch.body.php
+++ b/CirrusSearch.body.php
@@ -410,16 +410,20 @@
        }
 
        public function update( $id, $title, $text ) {
-               $revision = Revision::loadFromPageId( wfGetDB( DB_SLAVE ), $id 
);
-               $content = $revision->getContent();
                if ( in_array( $id, CirrusSearch::$updated ) ) {
                        // Already indexed $id
                        return;
                }
+               $revision = Revision::loadFromPageId( wfGetDB( DB_SLAVE ), $id 
);
+               $content = $revision->getContent();
                if ( $content->isRedirect() ) {
                        $target = $content->getUltimateRedirectTarget();
                        wfDebugLog( 'CirrusSearch', "Updating search index for 
$title which is a redirect to " . $target->getText() );
                        $targetRevision = Revision::loadFromPageId( wfGetDB( 
DB_SLAVE ), $target->getArticleID() );
+                       // If you are building a redirect to a nonexistent 
page then don't error out
+                       if ( $targetRevision === null ) {
+                               return;
+                       }
                        $newUpdate = new SearchUpdate( $target->getArticleID(), 
$target, $targetRevision->getContent() );
                        $newUpdate->doUpdate();
                } else {
@@ -438,10 +442,59 @@
        }
 
        /**
-        * @param $linkUpdate LinksUpdate
+        * Hooked to update the search index for pages when templates that they 
include are changed
+        * and to kick off updating linked articles.
+        * @param $linksUpdate LinksUpdate
         */
-       public static function linksUpdateCompletedHook( $linkUpdate ) {
-               $title = $linkUpdate->getTitle();
+       public static function linksUpdateCompletedHook( $linksUpdate ) {
+               self::updateFromTitle( $linksUpdate->getTitle() );
+               self::updateLinkedArticles( $linksUpdate );
+       }
+
+       /**
+        * Update the search index for articles linked from this article.
+        * @param $linksUpdate LinksUpdate
+        */
+       private static function updateLinkedArticles( $linksUpdate ) {
+               // This could be made more efficient by having LinksUpdate 
return a list of articles that
+               // have been newly linked or newly unlinked.  Those are the 
only articles that we need
+               // to reindex anyway.
+
+               // This could also be made more efficient by only updating the 
link counts rather than
+               // reindexing the whole article.
+               global $wgCirrusSearchLinkedArticlesToUpdate;
+
+               // Build a big list of candidate pages whose links we should 
update
+               $candidates = array();
+               foreach ( $linksUpdate->getParserOutput()->getLinks() as $ns => 
$ids ) {
+                       foreach ( $ids as $id ) {
+                               $candidates[] = $id;
+                       }
+               }
+
+               // Pick up to $wgCirrusSearchLinkedArticlesToUpdate links to 
update
+               $chosenCount = min( count( $candidates ), 
$wgCirrusSearchLinkedArticlesToUpdate );
+               if ( $chosenCount < 1 ) {
+                       return;
+               }
+               $chosen = array_rand( $candidates, $chosenCount );
+               // If $chosenCount === 1 then array_rand will return 
a key rather than an
+               // array of keys so just wrap the key and move on with the rest 
of the request.
+               if ( !is_array( $chosen ) ) {
+                       $chosen = array( $chosen );
+               }
+               foreach ( $chosen as $key ) {
+                       $title = Title::newFromID( $candidates[ $key ] );
+                       // Skip links to nonexistent pages.
+                       if ( $title === null ) {
+                               continue;
+                       }
+                       wfDebugLog( 'CirrusSearch', "Updating $title because it 
was linked." );
+                       self::updateFromTitle( $title );
+               }
+       }
+
+       private static function updateFromTitle( $title ) {
                $articleId = $title->getArticleID();
                $revision = Revision::loadFromPageId( wfGetDB( DB_SLAVE ), 
$articleId );
 
diff --git a/CirrusSearch.php b/CirrusSearch.php
index b863e2a..32e5b8f 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -59,6 +59,8 @@
 // Maximum number of redirects per target page to index.  
 $wgCirrusSearchIndexedRedirects = 1024;
 
+// Maximum number of linked articles to update every time an article changes.
+$wgCirrusSearchLinkedArticlesToUpdate = 5;
 
 
 $dir = __DIR__ . '/';

-- 
To view, visit https://gerrit.wikimedia.org/r/81020
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Icbf35f5443796085b688996b07703234c9116a0b
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to