Milimetric has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/346915 )
Change subject: Simplify and fix EventLogging instrumentation
......................................................................
Simplify and fix EventLogging instrumentation
The ExternalLinksChange schema has instrumentation in this extension.
The former instrumentation was trying to optimize for speed and never
worked properly. I simplified the implementation in the hope of
starting a review process. I am very new at mediawiki development and
so far I only used my intuition and the documentation on Manual:Hooks
Bug: T162365
Change-Id: I455b09fe2e77d3f6faccfdd3e2b4e7940344219e
---
M SpamBlacklistHooks.php
M SpamBlacklist_body.php
2 files changed, 67 insertions(+), 68 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/SpamBlacklist
refs/changes/15/346915/1
diff --git a/SpamBlacklistHooks.php b/SpamBlacklistHooks.php
index eb0d5dd..352c3b2 100644
--- a/SpamBlacklistHooks.php
+++ b/SpamBlacklistHooks.php
@@ -189,7 +189,7 @@
) {
if ( $revision ) {
BaseBlacklist::getInstance( 'spam' )
- ->doLogging( $user, $wikiPage->getTitle(),
$revision->getId() );
+ ->doLogging( $user, $wikiPage->getTitle(),
$revision->getId(), $content );
}
if ( !BaseBlacklist::isLocalSource( $wikiPage->getTitle() ) ) {
@@ -266,10 +266,10 @@
return;
}
- // Log the changes, but we only commit them once the deletion
has happened.
- // We do that since the external links table could get cleared
before the
- // ArticleDeleteComplete hook runs
- $spam->logUrlChanges( $spam->getCurrentLinks(
$article->getTitle() ), [], [] );
+ // The external links table could get cleared before the
deletion is complete,
+ // so stash the old links to log them in onArticleDeleteComplete
+ // NOTE: I would've done that in this class but I'm still
unsure about style
+ $spam->stashOldLinks( $article->getTitle() );
}
/**
@@ -285,6 +285,9 @@
) {
/** @var SpamBlacklist $spam */
$spam = BaseBlacklist::getInstance( 'spam' );
- $spam->doLogging( $user, $page->getTitle(), $page->getLatest()
);
+ // passing null as content means the new version has no links
+ // otherwise since the page is just archived there would be no
change
+ // NOTE: restoring the page will not log that new links are
added
+ $spam->doLogging( $user, $page->getTitle(), $page->getLatest(),
null );
}
}
diff --git a/SpamBlacklist_body.php b/SpamBlacklist_body.php
index 5dc807d..65ab5bd 100644
--- a/SpamBlacklist_body.php
+++ b/SpamBlacklist_body.php
@@ -11,10 +11,10 @@
const STASH_AGE_DYING = 150;
/**
- * Changes to external links, for logging purposes
- * @var array[]
- */
- private $urlChangeLog = array();
+ * old links can be purged before logging, stash here and clear after
using
+ * @var array[]
+ */
+ private $stashedOldLinks = array();
/**
* Returns the code for the blacklist implementation
@@ -55,12 +55,6 @@
$statsd =
MediaWikiServices::getInstance()->getStatsdDataFactory();
$cache = ObjectCache::getLocalClusterInstance();
- // If there are no new links, and we are logging,
- // mark all of the current links as being removed.
- if ( !$links && $this->isLoggingEnabled() ) {
- $this->logUrlChanges( $this->getCurrentLinks( $title ),
[], [] );
- }
-
if ( !$links ) {
return false;
}
@@ -95,25 +89,7 @@
if ( count( $blacklists ) ) {
// poor man's anti-spoof, see bug 12896
$newLinks = array_map( array( $this, 'antiSpoof' ),
$links );
-
- $oldLinks = array();
- if ( $title !== null ) {
- $oldLinks = $this->getCurrentLinks( $title );
- $addedLinks = array_diff( $newLinks, $oldLinks
);
- } else {
- // can't load old links, so treat all links as
added.
- $addedLinks = $newLinks;
- }
-
- wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ',
', $oldLinks ) );
wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ',
', $newLinks ) );
- wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode(
', ', $addedLinks ) );
-
- if ( !$preventLog ) {
- $this->logUrlChanges( $oldLinks, $newLinks,
$addedLinks );
- }
-
- $links = implode( "\n", $addedLinks );
# Strip whitelisted URLs from the match
if( is_array( $whitelists ) ) {
@@ -180,26 +156,8 @@
return $wgSpamBlacklistEventLogging && class_exists(
'EventLogging' );
}
- /**
- * Diff added/removed urls and generate events for them
- *
- * @param string[] $oldLinks
- * @param string[] $newLinks
- * @param string[] $addedLinks
- */
- public function logUrlChanges( $oldLinks, $newLinks, $addedLinks ) {
- if ( !$this->isLoggingEnabled() ) {
- return;
- }
-
- $removedLinks = array_diff( $oldLinks, $newLinks );
- foreach ( $addedLinks as $url ) {
- $this->logUrlChange( $url, 'insert' );
- }
-
- foreach ( $removedLinks as $url ) {
- $this->logUrlChange( $url, 'remove' );
- }
+ public function stashOldLinks( Title $title ) {
+ $this->stashedOldLinks = $this->getCurrentLinks( $title );
}
/**
@@ -209,23 +167,56 @@
* @param Title $title
* @param int $revId
*/
- public function doLogging( User $user, Title $title, $revId ) {
+ public function doLogging( User $user, Title $title, $revId, $content )
{
if ( !$this->isLoggingEnabled() ) {
return;
}
- $baseInfo = array(
- 'revId' => $revId,
- 'pageId' => $title->getArticleID(),
- 'pageNamespace' => $title->getNamespace(),
- 'userId' => $user->getId(),
- 'userText' => $user->getName(),
- );
- $changes = $this->urlChangeLog;
- // Empty the changes queue in case this function gets called
more than once
- $this->urlChangeLog = array();
+ // NOTE: this may be duplicate work in the critical save path,
+ // is that ok? I tested changing 7000 links and it doesn't
slow too much
+ $oldLinks = array();
- DeferredUpdates::addCallableUpdate( function() use ( $changes,
$baseInfo ) {
+ if ( empty( $this->$stashedOldLinks ) && $title !== null ) {
+ $oldLinks = $this->getCurrentLinks( $title );
+ } else {
+ $oldLinks = $this->$stashedOldLinks;
+ $this->$stashedOldLinks = array();
+ }
+
+ DeferredUpdates::addCallableUpdate( function() use ( $user,
$title, $revId, $oldLinks ) {
+
+ // Moving as much logic as possible in deferred update
to make save quick
+ sort($oldLinks);
+
+ $newLinks = array();
+ if ( $title !== null ) {
+ if ($content !== null) {
+ $newLinks = $this->getNewLinks( $title,
$content );
+ sort($newLinks);
+ }
+ }
+
+ $addedLinks = array_diff( $newLinks, $oldLinks );
+ $removedLinks = array_diff( $oldLinks, $newLinks );
+
+ $inserts = array_map(function ($a) {
+ return $this->getLogInfo($a, 'insert');
+ }, $addedLinks);
+
+ $removes = array_map(function ($a) {
+ return $this->getLogInfo($a, 'remove');
+ }, $removedLinks);
+
+ $changes = array_merge($inserts, $removes);
+
+ $baseInfo = array(
+ 'revId' => $revId,
+ 'pageId' => $title->getArticleID(),
+ 'pageNamespace' => $title->getNamespace(),
+ 'userId' => $user->getId(),
+ 'userText' => $user->getName(),
+ );
+
foreach ( $changes as $change ) {
EventLogging::logEvent(
'ExternalLinksChange',
@@ -242,7 +233,7 @@
* @param string $url
* @param string $action 'insert' or 'remove'
*/
- private function logUrlChange( $url, $action ) {
+ private function getLogInfo( $url, $action ) {
$parsed = wfParseUrl( $url );
if ( !isset( $parsed['host'] ) ) {
wfDebugLog( 'SpamBlacklist', "Unable to parse $url" );
@@ -256,8 +247,13 @@
'query' => isset( $parsed['query'] ) ? $parsed['query']
: '',
'fragment' => isset( $parsed['fragment'] ) ?
$parsed['fragment'] : '',
);
+ return $info;
+ }
- $this->urlChangeLog[] = $info;
+ function getNewLinks( Title $title, Content $content ) {
+ $parserOptions =
$content->getContentHandler()->makeParserOptions( 'canonical' );
+ $output = $content->getParserOutput( $title, null,
$parserOptions );
+ return array_keys( $output->getExternalLinks() );
}
/**
--
To view, visit https://gerrit.wikimedia.org/r/346915
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I455b09fe2e77d3f6faccfdd3e2b4e7940344219e
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/SpamBlacklist
Gerrit-Branch: master
Gerrit-Owner: Milimetric <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits