Manybubbles has uploaded a new change for review.
https://gerrit.wikimedia.org/r/102323
Change subject: Use Cirrus's fancy redirect tracing with --from
......................................................................
Use Cirrus's fancy redirect tracing with --from
Date based reindexes aren't handling nasty stuff like redirect loops
properly. Cirrus's runtime code handles that. It is simplest to just
use it during date based indexing.
Bug: 58597
Change-Id: I79f4586036f36e6876c007867801c519420b7d3c
---
M includes/CirrusSearchUpdater.php
M maintenance/forceSearchIndex.php
2 files changed, 43 insertions(+), 11 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/23/102323/1
diff --git a/includes/CirrusSearchUpdater.php b/includes/CirrusSearchUpdater.php
index 9bf5886..89a664b 100644
--- a/includes/CirrusSearchUpdater.php
+++ b/includes/CirrusSearchUpdater.php
@@ -46,6 +46,22 @@
public static function updateFromTitle( $title ) {
global $wgCirrusSearchShardTimeout,
$wgCirrusSearchClientSideUpdateTimeout;
+ $page = self::traceRedirects( $title );
+ if ( $page ) {
+ self::updatePages( array( $page ), false,
$wgCirrusSearchShardTimeout,
+ $wgCirrusSearchClientSideUpdateTimeout,
self::INDEX_EVERYTHING );
+ }
+ }
+
+ /**
+ * Trace redirects from the title to the destination. Also registers
the title in the
+ * memory of titles updated and detects special pages.
+ * @param Title $title title to trace
+ * @return WikiPage|null wikipage if the $title either isn't a redirect
or resolves to
+ * an updateable page that hasn't been updated yet. Null if the
page has been
+ * updated, is a special page, or the redirects enter a loop.
+ */
+ public static function traceRedirects( $title ) {
// Loop through redirects until we get to the ultimate target
while ( true ) {
$titleText = $title->getFullText();
@@ -53,18 +69,18 @@
// Already indexed this article in this
process. This is mostly useful
// to catch self redirects but has a storied
history of catching strange
// behavior.
- return;
+ return null;
}
// Never. Ever. Index. Negative. Namespaces.
if ( $title->getNamespace() < 0 ) {
- return;
+ return null;
}
$page = WikiPage::factory( $title );
if ( !$page->exists() ) {
wfDebugLog( 'CirrusSearch', "Ignoring an update
for a non-existant page: $titleText" );
- return;
+ return null;
}
$content = $page->getContent();
if ( is_string( $content ) ) {
@@ -78,20 +94,26 @@
if ( $target->equals( $page->getTitle() ) ) {
// This doesn't warn about redirect
loops longer than one but we'll catch those anyway.
wfDebugLog( 'CirrusSearch', "Title
redirecting to itself. Skip indexing" );
- return;
+ return null;
}
wfDebugLog( 'CirrusSearch', "Updating search
index for $title which is a redirect to " . $target->getText() );
$title = $target;
continue;
} else {
- self::updatePages( array( $page ), false,
$wgCirrusSearchShardTimeout,
- $wgCirrusSearchClientSideUpdateTimeout,
self::INDEX_EVERYTHING );
- return;
+ return $page;
}
}
}
/**
+ * Clear the array of already updated/seen pages that is used for
deduplication and redirect
+ * chain detection to free memory.
+ */
+ public static function clearUpdated() {
+ self::$updated = array();
+ }
+
+ /**
* This updates pages in elasticsearch.
*
* $flags includes:
diff --git a/maintenance/forceSearchIndex.php b/maintenance/forceSearchIndex.php
index 7588c63..3935337 100644
--- a/maintenance/forceSearchIndex.php
+++ b/maintenance/forceSearchIndex.php
@@ -42,8 +42,8 @@
public function __construct() {
parent::__construct();
- $this->mDescription = "Force indexing some pages. Setting
neither from nor to will get you a more efficient "
- . "query at the cost of having to reindex by page id
rather than time.\n\n"
+ $this->mDescription = "Force indexing some pages. Setting
--from or --to will switch from id based indexing to "
+ . "date based indexing which uses less efficient
queries and follows redirects.\n\n"
. "Note: All froms are _exclusive_ and all tos are
_inclusive_.\n"
. "Note 2: Setting fromId and toId use the efficient
query so those are ok.";
$this->setBatchSize( 50 );
@@ -271,6 +271,8 @@
wfProfileIn( __METHOD__ . '::decodeResults' );
$result = array();
foreach ( $res as $row ) {
+ // No need to call CirrusSearchUpdater::traceRedirects
here because we know this is a valid page because
+ // it is in the database.
$page = WikiPage::newFromRow( $row,
WikiPage::READ_LATEST );
$content = $page->getContent();
if ( $content === null ) {
@@ -285,8 +287,16 @@
// for large wikis.
$page = null;
} else {
- $target =
$page->getContent()->getUltimateRedirectTarget();
- $page = WikiPage::newFromID(
$target->getArticleID(), WikiPage::READ_LATEST );
+ // We found a redirect. Great. Since
we can't index special pages and redirects to special pages
+ // are totally possible, as well as fun
stuff like redirect loops, we need to use
+ // CirrusSearchUpdater's redirect
tracing logic which is very complete. Also, it returns null on
+ // self redirects. Great!
+ $page =
CirrusSearchUpdater::traceRedirects( $page->getTitle() );
+ // The cost of using the fancy redirect
handling logic is that we have to clear some global state
+ // that cirrus maintains to prevent
duplicate in process updates and to catch redirect loops.
+ // We could use the deplicate
prevention, but we can't afford to let a global array grow without
+ // bounds, so clear it.
+ CirrusSearchUpdater::clearUpdated();
}
}
$update = array(
--
To view, visit https://gerrit.wikimedia.org/r/102323
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I79f4586036f36e6876c007867801c519420b7d3c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits