Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/102323


Change subject: Use Cirrus's fancy redirect tracing with --from
......................................................................

Use Cirrus's fancy redirect tracing with --from

Date based reindexes aren't handling nasty stuff like redirect loops
properly.  Cirrus's runtime code handles that.  It is simplest to just
use it during date based indexing.

Bug: 58597
Change-Id: I79f4586036f36e6876c007867801c519420b7d3c
---
M includes/CirrusSearchUpdater.php
M maintenance/forceSearchIndex.php
2 files changed, 43 insertions(+), 11 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/23/102323/1

diff --git a/includes/CirrusSearchUpdater.php b/includes/CirrusSearchUpdater.php
index 9bf5886..89a664b 100644
--- a/includes/CirrusSearchUpdater.php
+++ b/includes/CirrusSearchUpdater.php
@@ -46,6 +46,22 @@
        public static function updateFromTitle( $title ) {
                global $wgCirrusSearchShardTimeout, 
$wgCirrusSearchClientSideUpdateTimeout;
 
+               $page = self::traceRedirects( $title );
+               if ( $page ) {
+                       self::updatePages( array( $page ), false, 
$wgCirrusSearchShardTimeout,
+                               $wgCirrusSearchClientSideUpdateTimeout, 
self::INDEX_EVERYTHING );
+               }
+       }
+
+       /**
+        * Trace redirects from the title to the destination.  Also registers 
the title in the
+        * memory of titles updated and detects special pages.
+        * @param Title $title title to trace
+        * @return WikiPage|null wikipage if the $title either isn't a redirect 
or resolves to
+        *    an updateable page that hasn't been updated yet.  Null if the 
page has been
+        *    updated, is a special page, or the redirects enter a loop.
+        */
+       public static function traceRedirects( $title ) {
                // Loop through redirects until we get to the ultimate target
                while ( true ) {
                        $titleText = $title->getFullText();
@@ -53,18 +69,18 @@
                                // Already indexed this article in this 
process.  This is mostly useful
                                // to catch self redirects but has a storied 
history of catching strange
                                // behavior.
-                               return;
+                               return null;
                        }
 
                        // Never. Ever. Index. Negative. Namespaces.
                        if ( $title->getNamespace() < 0 ) {
-                               return;
+                               return null;
                        }
 
                        $page = WikiPage::factory( $title );
                        if ( !$page->exists() ) {
                                wfDebugLog( 'CirrusSearch', "Ignoring an update 
for a non-existant page: $titleText" );
-                               return;
+                               return null;
                        }
                        $content = $page->getContent();
                        if ( is_string( $content ) ) {
@@ -78,20 +94,26 @@
                                if ( $target->equals( $page->getTitle() ) ) {
                                        // This doesn't warn about redirect 
loops longer than one but we'll catch those anyway.
                                        wfDebugLog( 'CirrusSearch', "Title 
redirecting to itself. Skip indexing" );
-                                       return;
+                                       return null;
                                }
                                wfDebugLog( 'CirrusSearch', "Updating search 
index for $title which is a redirect to " . $target->getText() );
                                $title = $target;
                                continue;
                        } else {
-                               self::updatePages( array( $page ), false, 
$wgCirrusSearchShardTimeout,
-                                       $wgCirrusSearchClientSideUpdateTimeout, 
self::INDEX_EVERYTHING );
-                               return;
+                               return $page;
                        }
                }
        }
 
        /**
+        * Clear the array of already updated/seen pages that is used for 
deduplication and redirect
+        * chain detection to free memory.
+        */
+       public static function clearUpdated() {
+               self::$updated = array();
+       }
+
+       /**
         * This updates pages in elasticsearch.
         *
         * $flags includes:
diff --git a/maintenance/forceSearchIndex.php b/maintenance/forceSearchIndex.php
index 7588c63..3935337 100644
--- a/maintenance/forceSearchIndex.php
+++ b/maintenance/forceSearchIndex.php
@@ -42,8 +42,8 @@
 
        public function __construct() {
                parent::__construct();
-               $this->mDescription = "Force indexing some pages.  Setting 
neither from nor to will get you a more efficient "
-                       . "query at the cost of having to reindex by page id 
rather than time.\n\n"
+               $this->mDescription = "Force indexing some pages.  Setting 
--from or --to will switch from id based indexing to "
+                       . "date based indexing which uses less efficient 
queries and follows redirects.\n\n"
                        . "Note: All froms are _exclusive_ and all tos are 
_inclusive_.\n"
                        . "Note 2: Setting fromId and toId use the efficient 
query so those are ok.";
                $this->setBatchSize( 50 );
@@ -271,6 +271,8 @@
                wfProfileIn( __METHOD__ . '::decodeResults' );
                $result = array();
                foreach ( $res as $row ) {
+                       // No need to call CirrusSearchUpdater::traceRedirects 
here because we know this is a valid page because
+                       // it is in the database.
                        $page = WikiPage::newFromRow( $row, 
WikiPage::READ_LATEST );
                        $content = $page->getContent();
                        if ( $content === null ) {
@@ -285,8 +287,16 @@
                                        // for large wikis.
                                        $page = null;
                                } else {
-                                       $target = 
$page->getContent()->getUltimateRedirectTarget();
-                                       $page = WikiPage::newFromID( 
$target->getArticleID(), WikiPage::READ_LATEST );
+                                       // We found a redirect.  Great.  Since 
we can't index special pages and redirects to special pages
+                                       // are totally possible, as well as fun 
stuff like redirect loops, we need to use
+                                       // CirrusSearchUpdater's redirect 
tracing logic which is very complete.  Also, it returns null on
+                                       // self redirects.  Great!
+                                       $page = 
CirrusSearchUpdater::traceRedirects( $page->getTitle() );
+                                       // The cost of using the fancy redirect 
handling logic is that we have to clear some global state
+                                       // that cirrus maintains to prevent 
duplicate in process updates and to catch redirect loops.
+                                       // We could use the deplicate 
prevention, but we can't afford to let a global array grow without
+                                       // bounds, so clear it.
+                                       CirrusSearchUpdater::clearUpdated();
                                }
                        }
                        $update = array(

-- 
To view, visit https://gerrit.wikimedia.org/r/102323
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I79f4586036f36e6876c007867801c519420b7d3c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to