jenkins-bot has submitted this change and it was merged.
Change subject: General cleanup
......................................................................
General cleanup
Removed the $dir variable, made one function return true (the pre-existing
"true;" line likely did nothing), moved one hard-coded English string into
the i18n file, updated/added function documentation, added a meaningful
version number into extension credits.
Change-Id: I2d7255fb3c44853c45c2ed5b3cee8a2e7b16f897
---
M BayesianFilter.Body.php
M BayesianFilter.DBHandler.php
M BayesianFilter.Hooks.php
M BayesianFilter.PageView.php
M BayesianFilter.Tokenizer.php
M BayesianFilter.php
M i18n/en.json
M i18n/qqq.json
8 files changed, 288 insertions(+), 331 deletions(-)
Approvals:
Legoktm: Looks good to me, approved
jenkins-bot: Verified
diff --git a/BayesianFilter.Body.php b/BayesianFilter.Body.php
index 6d5ed92..f886626 100644
--- a/BayesianFilter.Body.php
+++ b/BayesianFilter.Body.php
@@ -2,28 +2,26 @@
class BayesianFilter {
- public function __construct()
- {
+ public function __construct() {
// this is written here instead of autoloader, cause
// tokenizer is written in a way it can be reused by
// other scripts which are not neccesarily part of mediawiki
- if( !class_exists( 'BayesianFilterTokenizer' ) )
- {
+ if ( !class_exists( 'BayesianFilterTokenizer' ) ) {
include( __DIR__ . '/BayesianFilter.Tokenizer.php' );
}
}
/**
- * This function users $wgParser and returns the external links
- * in the content of the page. As of now it is not used anywhere
- * but in the later releases links would have a very big role to
- * play in spam detection
- * @text content of the page which we are inspecting
- * return an array of links
- */
-
- public function getLinks( $text, $title )
- {
+ * This function uses $wgParser and returns the external links
+ * in the content of the page.
+ * As of now it is not used anywhere but in the later releases links
would
+ * have a very big role to play in spam detection.
+ *
+ * @param string $text Content of the page which we are inspecting
+ * @param Title $title
+ * @return array An array of links
+ */
+ public function getLinks( $text, $title ) {
global $wgParser, $wgUser;
$options = new ParserOptions();
$modifiedText = $wgParser->preSaveTransform( $text, $title,
$wgUser, $options );
@@ -33,15 +31,14 @@
}
/**
- * This function contains the main logic for spam detection
- * it evaulated whether content of a WikiPage is
- * spam or not.
- * @text content of the page which we are inspecting
- * returns true if the content is spam, false otherwise
- */
-
- public function checkSpam( $text, $title )
- {
+ * This function contains the main logic for spam detection.
+ * It evaluates whether the content of a WikiPage is spam or not.
+ *
+ * @param string $text Content of the page which we are inspecting
+ * @param Title $title
+ * @return bool True if the content is spam, false otherwise
+ */
+ public function checkSpam( $text, $title ) {
$links = $this->getLinks( $text, $title );
$tokenizer = new BayesianFilterTokenizer;
@@ -50,18 +47,18 @@
$words = array();
$token = $tokenizer->tokenize( $text );
- while( $token )
- {
+ while ( $token ) {
$token = strtolower( $token );
- if( !$tokenizer->isStopWord( $token ) )
+ if ( !$tokenizer->isStopWord( $token ) ) {
$words[] = $tokenizer->stem( $token );
+ }
$token = $tokenizer->tokenize();
}
$filterDbHandler = new BayesianFilterDBHandler;
global $wgBayesianFilterWordsChunkSize;
$wordsFrequency;
- $wordsFrequency = $filterDbHandler->getFrequency( $words ,
$wgBayesianFilterWordsChunkSize );
+ $wordsFrequency = $filterDbHandler->getFrequency( $words,
$wgBayesianFilterWordsChunkSize );
$probMsgGivenSpam = 1.0;
$probMsgGivenHam = 1.0;
@@ -69,12 +66,10 @@
$hamCount = isset( $wordsFrequency['ham_count'] ) ?
$wordsFrequency['ham_count'] : 0;
$wordCount = count( $words );
- foreach ($words as $word )
- {
- if( isset( $wordsFrequency[$word] ) )
- {
- $probMsgGivenSpam = $probMsgGivenSpam * (
$wordsFrequency[$word]['spam'] + 1);
- $probMsgGivenHam = $probMsgGivenHam * (
$wordsFrequency[$word]['ham'] + 1);
+ foreach ( $words as $word ) {
+ if ( isset( $wordsFrequency[$word] ) ) {
+ $probMsgGivenSpam = $probMsgGivenSpam * (
$wordsFrequency[$word]['spam'] + 1 );
+ $probMsgGivenHam = $probMsgGivenHam * (
$wordsFrequency[$word]['ham'] + 1 );
}
$probMsgGivenSpam = $probMsgGivenSpam / ( $spamCount +
$wordCount );
@@ -85,12 +80,9 @@
$spamHamCount = $filterDbHandler->getSpamHamCount();
// this if condition is added to prevent division by zero.
- if( !$spamHamCount['spam'] || !$spamHamCount['ham'] )
- {
+ if ( !$spamHamCount['spam'] || !$spamHamCount['ham'] ) {
$spamProb = ( $spamHamCount['spam'] ) / (
$spamHamCount['spam'] + $spamHamCount['ham'] );
- }
- else
- {
+ } else {
$spamProb = 0.5;
}
$hamProb = 1.0 - $spamProb;
@@ -98,40 +90,35 @@
$probMsgGivenSpam = $probMsgGivenSpam * $spamProb;
$probMsgGivenHam = $probMsgGivenHam * $hamProb;
- if( $probMsgGivenSpam > $probMsgGivenHam )
- {
+ if ( $probMsgGivenSpam > $probMsgGivenHam ) {
return true;
- }
- else
- {
- $filterDbHandler->insertFrequencyTable( $words, "ham" );
+ } else {
+ $filterDbHandler->insertFrequencyTable( $words, 'ham' );
return false;
}
}
/**
- * this function trains the DB for ham and spam
- * @text the string to train
- * @category can have two values ham or spam
- * returns nothing
- */
-
- public function train( $text, $category )
- {
-
+ * Trains the database for ham and spam.
+ *
+ * @param string $text The string to train
+ * @param string $category Can have two values, ham or spam
+ * @return void
+ */
+ public function train( $text, $category ) {
$tokenizer = new BayesianFilterTokenizer;
$text = $tokenizer->normalize( $text );
$token = $tokenizer->tokenize( $text );
$words = array();
- while( $token )
- {
+ while ( $token ) {
$token = strtolower( $token );
- if( !$tokenizer->isStopWord( $token ) )
- {
+
+ if ( !$tokenizer->isStopWord( $token ) ) {
$words[] = $tokenizer->stem( $token );
}
+
$token = $tokenizer->tokenize();
}
diff --git a/BayesianFilter.DBHandler.php b/BayesianFilter.DBHandler.php
index e602ed7..d0178d7 100644
--- a/BayesianFilter.DBHandler.php
+++ b/BayesianFilter.DBHandler.php
@@ -3,200 +3,187 @@
class BayesianFilterDBHandler {
/**
- * returns the array of frequency of each element is an array
- * @itemArray an array of items whose frequencies to be returned
- */
-
- private function itemFrequency( $itemArray )
- {
+ * @param array $itemArray Items whose frequencies to be returned
+ * @return array The array of frequency of each element is an array
+ */
+ private function itemFrequency( $itemArray ) {
$itemFrequencyArray = array();
+
foreach ( $itemArray as $item ) {
- if( isset( $itemFrequencyArray[$item] ) )
+ if ( isset( $itemFrequencyArray[$item] ) ) {
$itemFrequencyArray[$item] += 1;
- else
+ } else {
$itemFrequencyArray[$item] = 1;
+ }
}
+
return $itemFrequencyArray;
}
/**
- * returns the spam and ham frequecy of the words in $words array
- * @chunksize defines the size of the chunk that should be queried from
a db in
- * a single db_query. If 0 it means get all the words from words array.
- * @words is the array of words whose frequency are required
- */
-
- public function getFrequency( $words , $chunksize )
- {
+ * @param array $words Array of words whose frequency are required
+ * @param int $chunkSize Defines the size of the chunk that should be
+ * queried from a DB in a single DB query.
+ * If 0, it means get all the words from words
array.
+ * @return array The spam and ham frequency of the words in $words array
+ */
+ public function getFrequency( $words, $chunkSize ) {
$wordsFrequency = $this->itemFrequency( $words );
$words = array_keys( $wordsFrequency );
$dbr = wfGetDB( DB_SLAVE );
$wordsFrequency = array();
- if( $chunksize )
- {
- //array_chunk returns a multidimensional array, where
is each array
- //is of size = $chunksize
- $wordsMultiArray = array_chunk( $words , $chunksize );
+ if ( $chunkSize ) {
+ // array_chunk returns a multidimensional array, where
each array
+ // is of size = $chunkSize
+ $wordsMultiArray = array_chunk( $words, $chunkSize );
foreach ( $wordsMultiArray as $words ) {
$res = $dbr->select(
- "word_frequency",
- array( "wf_word", "wf_spam",
"wf_ham" ),
- array( "wf_word" => $words ),
- __METHOD__,
- array()
- );
- foreach ($res as $row ) {
+ 'word_frequency',
+ array( 'wf_word', 'wf_spam', 'wf_ham' ),
+ array( 'wf_word' => $words ),
+ __METHOD__
+ );
+ foreach ( $res as $row ) {
$wordsFrequency[$row->wf_word] =
array();
$wordsFrequency[$row->wf_word]['spam']
= $row->wf_spam;
$wordsFrequency[$row->wf_word]['spam']
= $row->wf_ham;
}
}
- }
- else
- {
+ } else {
$res = $dbr->select(
- "word_frequency",
- array( "wf_word", "wf_spam", "wf_ham" ),
- array( "wf_word" => $words ),
- __METHOD__,
- array()
- );
- foreach ($res as $row ) {
+ 'word_frequency',
+ array( 'wf_word', 'wf_spam', 'wf_ham' ),
+ array( 'wf_word' => $words ),
+ __METHOD__
+ );
+ foreach ( $res as $row ) {
$wordsFrequency[$row->wf_word] = array();
$wordsFrequency[$row->wf_word]['spam'] =
$row->wf_spam;
$wordsFrequency[$row->wf_word]['ham'] =
$row->wf_ham;
}
}
+
$res = $dbr->selectRow(
- "word_frequency",
- array('spam_count' => 'SUM(wf_spam)',
"ham_count" => 'SUM(wf_ham)'),
- array(),
- __METHOD__,
- array()
- );
+ 'word_frequency',
+ array( 'spam_count' => 'SUM(wf_spam)', 'ham_count' =>
'SUM(wf_ham)' ),
+ array(),
+ __METHOD__
+ );
+
$wordsFrequency['spam_count'] = $res->spam_count;
$wordsFrequency['ham_count'] = $res->ham_count;
+
return $wordsFrequency;
}
-
/**
- * updates the frequency of words in words array in ham or spam column
depending
- * upon the value of $category
- * @words whose frequency is to be updated
- * @category in which category it should be updated
- */
-
- public function insertFrequencyTable( $words, $category )
- {
+ * Updates the frequency of words in words array in ham or spam column
depending
+ * upon the value of $category
+ *
+ * @param array $words Words whose frequency is to be updated
+ * @param string $category In which category it should be updated
+ */
+ public function insertFrequencyTable( $words, $category ) {
$wordsFrequency = $this->itemFrequency( $words );
$words = array_keys( $wordsFrequency );
$dbr = wfGetDB( DB_SLAVE );
$res = $dbr->select(
- "word_frequency",
- array( "wf_word", "wf_spam", "wf_ham" ),
- array( "wf_word" => $words ),
- __METHOD__,
- array()
- );
+ 'word_frequency',
+ array( 'wf_word', 'wf_spam', 'wf_ham' ),
+ array( 'wf_word' => $words ),
+ __METHOD__
+ );
+
$exists = array();
+
foreach ( $res as $row ) {
$exists[$row->wf_word] = array();
$exists[$row->wf_word]['spam'] = $row->wf_spam;
$exists[$row->wf_word]['ham'] = $row->wf_ham;
}
+
$dbw = wfGetDB( DB_MASTER );
- $fieldName = "wf_" . $category;
- foreach ( $words as $word )
- {
- if( isset( $exists[$word] ) )
- {
+ $fieldName = 'wf_' . $category;
+
+ foreach ( $words as $word ) {
+ if ( isset( $exists[$word] ) ) {
$dbw->update(
- "word_frequency",
+ 'word_frequency',
array( $fieldName => (
$exists[$word][$category] + $wordsFrequency[$word] ) ),
array( 'wf_word' => $word ),
- __METHOD__,
- array()
+ __METHOD__
);
- }
- else
- {
+ } else {
$dbw->insert(
- "word_frequency",
+ 'word_frequency',
array(
$fieldName =>
$wordsFrequency[$word],
'wf_word' => $word
- ),
- __METHOD__,
- array()
+ ),
+ __METHOD__
);
}
}
}
/**
- * returns the revision text from the revison table
- * which has the revision id same as the undidRevision
- * @undidRevision id of the revision to be returned.
- */
-
- public function getRevertedText( $undidRevision )
- {
+ * Gets the revision text from the revision table
+ * which has the revision ID same as the undidRevision
+ *
+ * @param int $undidRevision ID of the revision to be returned
+ * @return string Revision text
+ */
+ public function getRevertedText( $undidRevision ) {
$dbr = wfGetDB( DB_SLAVE );
$res = $dbr->selectRow(
- array( 'rev' => "revision", 'txt' => "text" ),
- array( 'text' => "old_text" ),
- array( 'rev.rev_id' => $undidRevision,
"txt.old_id = rev.rev_id" ),
- __METHOD__,
- array()
- );
+ array( 'rev' => 'revision', 'txt' => 'text' ),
+ array( 'text' => 'old_text' ),
+ array( 'rev.rev_id' => $undidRevision, 'txt.old_id =
rev.rev_id' ),
+ __METHOD__
+ );
return $res->text;
}
/**
- * insert the text into spam_ham_text table
- * @content the text to be inserted
- * @spam, if true, then the spam field is set as 1, 0 otherwise
- */
-
- public function insertSpamText( $content, $spam=true )
- {
- $dbw = wfGetDB( DB_MASTER ) ;
+ * Insert the text into spam_ham_text table
+ *
+ * @param string $content The text to be inserted
+ * @param bool $spam If true, then the spam field is set as 1, 0
otherwise
+ */
+ public function insertSpamText( $content, $spam = true ) {
+ $dbw = wfGetDB( DB_MASTER );
$dbw->insert(
- "spam_ham_texts",
- array(
- 'sht_spam' => $spam,
- 'sht_text' => $content
- ),
- __METHOD__,
- array()
- );
+ 'spam_ham_texts',
+ array(
+ 'sht_spam' => $spam,
+ 'sht_text' => $content
+ ),
+ __METHOD__
+ );
}
/**
- * returns the number of spam texts as result['spam']
- * and the number or ham texts as result['ham']
- */
-
- public function getSpamHamCount()
- {
+ * @return array The number of spam texts as result['spam']
+ * and the number of ham texts as result['ham']
+ */
+ public function getSpamHamCount() {
$result = array( 'spam' => 0, 'ham' => 0 );
- $dbr = wfGetDB( DB_SLAVE ) ;
+ $dbr = wfGetDB( DB_SLAVE );
$res = $dbr->select(
- "spam_ham_texts",
- array( "sht_id", "sht_spam" ),
- array(),
- __METHOD__,
- array()
- );
- foreach ($res as $row ) {
- if($row->sht_spam == 1)
+ 'spam_ham_texts',
+ array( 'sht_id', 'sht_spam' ),
+ array(),
+ __METHOD__
+ );
+ foreach ( $res as $row ) {
+ if ( $row->sht_spam == 1 ) {
$result['spam']++;
- else
+ } else {
$result['ham']++;
+ }
}
return $result;
}
diff --git a/BayesianFilter.Hooks.php b/BayesianFilter.Hooks.php
index b10eb7c..3e0deb0 100644
--- a/BayesianFilter.Hooks.php
+++ b/BayesianFilter.Hooks.php
@@ -1,26 +1,20 @@
<?php
-
-if ( !defined( 'MEDIAWIKI' ) ) { exit; }
-
/**
-* Hooks for Bayesian Filter Extension
-*/
-
+ * Hooks for Bayesian Filter extension
+ */
class BayesianFilterHooks {
/**
- * Hook function for EditFilterMerged
- * This function runs the content through our filter
- * @editpage is the instance of Editpage, the page is which was edited
- * @content is the content of the page as str
- * @hookErr: error message to return in case the edit is prohibited
- * @summary is the summary entered by the user while editing the page
- * returns true if the article is not spam
- * returns false otherwise, aslo sets the error message accordingly
- */
-
+ * Hook function for EditFilterMerged
+ * This function runs the content through our filter
+ *
+ * @param EditPage $editPage
+ * @param string $content The content of the page
+ * @param string $hookErr Error message to return in case the edit is
prohibited
+ * @param string $summary The edit summary entered by the user while
editing the page
+ * @return bool True if the article is not spam, otherwise false
+ */
public static function filterMerged( EditPage $editPage, $content,
&$hookErr, $summary ) {
-
$context = $editPage->mArticle->getContext();
$request = $context->getRequest();
$filter = new BayesianFilter;
@@ -28,48 +22,39 @@
$undidRevision = $request->getVal( 'wpUndidRevision' );
- if( isset( $undidRevision ) && !empty( $undidRevision ) )
- {
+ if ( isset( $undidRevision ) && !empty( $undidRevision ) ) {
$wpSpam = $request->getVal( 'wpSpam' );
- if( isset( $wpSpam ) )
- {
+ if ( isset( $wpSpam ) ) {
$text = $filterDbHandler->getRevertedText(
$undidRevision );
- $filter->train( $text, "spam" );
- $filterDbHandler->insertSpamText( $text ) ;
+ $filter->train( $text, 'spam' );
+ $filterDbHandler->insertSpamText( $text );
}
- }
- else
- {
+ } else {
$result = $filter->checkSpam( $content,
$editPage->getContextTitle() );
- if( $result )
- {
+ if ( $result ) {
$editPage->spamPageWithContent( $result );
- $hookErr = "Sorry the content on this page is
spam. It cannot be saved";
+ $hookErr = wfMessage(
'bayesianfilter-content-is-spam' )->escaped();
return false;
- }
- else
- {
+ } else {
$filterDbHandler->insertSpamText( $content,
false );
}
}
+
return true;
}
-
/**
- * Hook function for EditPageBeforeEditChecks
- *
- * This hook is run whenever an article is opened for edit. It adds the
"Mark as Spam" checkbox
- * besides "Watch this Page" and "This is a minor edit"
- * @editPage is passed by reference to this function.
- * @checks is an array that is passed by reference to this function. It
is an array of checkboxes
- * @tabindex is the index of current tab.
- * return true in each case
- */
-
-
+ * Hook function for EditPageBeforeEditChecks
+ *
+ * This hook is run whenever an article is opened for edit. It adds the
"Mark as Spam" checkbox
+ * besides "Watch this Page" and "This is a minor edit"
+ *
+ * @param EditPage $editPage
+ * @param array $checks Array of checkboxes
+ * @param int $tabindex The index of current tab
+ * @return bool True in each case
+ */
public static function addFlagSpamCheckbox( &$editPage, &$checks,
&$tabindex ) {
-
$context = $editPage->mArticle->getContext();
$view = new BayesianFilterPageView( $context );
$view->addFlagSpamCheckbox( $checks, $tabindex );
@@ -78,37 +63,40 @@
}
/**
- * This hooks is run whenever any sysop deletes a page
- * We use it for training out Spam Database. We assume,
- * that the article being deleted is done for spam purposes
- * which is true in most cases
- * @artcile is the Wikipage for which the deletion request is made.
- * It is an instance of the WikiPage Class
- * @user is the currently logged in user
- * @reason is the reason(str) for which the article is being deleted.
- * @error error message to be displayed if the article deletion was
prohibited.
- * return true in each case
- */
-
+ * This hook is run whenever any sysop deletes a page
+ * We use it for training our Spam Database.
+ * We assume that the article being deleted is done for spam purposes,
+ * which is true in most cases.
+ *
+ * @param WikiPage $article The page being deleted
+ * @param User $user The currently logged in user
+ * @param string $reason The reason for which the page is being deleted
+ * @param string $error Message to be displayed if the page deletion
was prohibited.
+ * @return bool True in each case
+ */
public static function onArticleDelete( &$article, User &$user,
&$reason, &$error ) {
$content = $article->getContent();
$text = $content->mText;
$filter = new BayesianFilter;
- $filter->train( $text, "spam" );
+ $filter->train( $text, 'spam' );
$filterDbHandler = new BayesianFilterDBHandler;
- $filterDbHandler->insertSpamText( $text ) ;
- true;
+ $filterDbHandler->insertSpamText( $text );
+ return true;
}
/**
+ * Adds the new, required database tables when the user runs
+ * maintenance/update.php, MediaWiki's core update script from the
command
+ * line.
+ *
* @param $updater DatabaseUpdater
* @throws MWException
* @return bool
*/
public static function onLoadExtensionSchemaUpdates( $updater = null ) {
- $dir = dirname( __FILE__ );
- $updater-> addExtensionTable( "word_frequency",
"$dir/db_patches/word_frequency.sql" );
- $updater-> addExtensionTable( "spam_ham_texts",
"$dir/db_patches/spam_ham_texts.sql" );
+ $dir = __DIR__;
+ $updater->addExtensionTable( 'word_frequency',
"$dir/db_patches/word_frequency.sql" );
+ $updater->addExtensionTable( 'spam_ham_texts',
"$dir/db_patches/spam_ham_texts.sql" );
return true;
}
}
diff --git a/BayesianFilter.PageView.php b/BayesianFilter.PageView.php
index 86a8251..164fc4b 100644
--- a/BayesianFilter.PageView.php
+++ b/BayesianFilter.PageView.php
@@ -6,25 +6,22 @@
protected $context;
- public function __construct( $context )
- {
+ public function __construct( $context ) {
$this->context = $context;
}
/**
- * It adds the "Mark as Spam" checkbox
- * besides "Watch this Page" and "This is a minor edit"
- * @checkboxes is an array that is passed by reference to this function.
It is an array of checkboxes
- * @tabindex is the index of current tab.
- */
-
- public function addFlagSpamCheckbox( array &$checkboxes, &$tabindex ){
-
+ * It adds the "Mark as Spam" checkbox
+ * besides "Watch this Page" and "This is a minor edit"
+ *
+ * @param array $checkboxes Array of checkboxes
+ * @param int $tabindex Index of current tab
+ */
+ public function addFlagSpamCheckbox( array &$checkboxes, &$tabindex ) {
$request = $this->context->getRequest();
$undo = $request->getVal( 'undo' );
- if( isset( $undo ) )
- {
+ if ( isset( $undo ) ) {
$checkbox = Xml::check(
'wpSpam',
false,
diff --git a/BayesianFilter.Tokenizer.php b/BayesianFilter.Tokenizer.php
index 5128e25..2aafbd9 100644
--- a/BayesianFilter.Tokenizer.php
+++ b/BayesianFilter.Tokenizer.php
@@ -1,44 +1,38 @@
<?php
-
/**
* This class contains the definitions of all the functions used for
tokenizing the input
*/
-
-class BayesianFilterTokenizer{
+class BayesianFilterTokenizer {
/**
- * A wiki text consists of square brackets and ~, == section headling,
signatures.
- * sanitizes removes all such transformations that wiki does.
- */
- public function normalize( $text ){
+ * A wiki text consists of square brackets and ~, == section headling,
signatures.
+ * sanitizes removes all such transformations that wiki does.
+ */
+ public function normalize( $text ) {
+ $text = strip_tags( $text ); // strips the HTML tags like <br
/> and <nowiki>
- $text = strip_tags( $text ); //strips the html tags like <br
/> and <nowiki>
+ // remove the special characters which hold significance in
wiki formatting
+ $specialChars = array( "'", "\"", '=', '--', '*', '|' );
+ $text = str_replace( $specialChars, '', $text );
- //remove the special characters which hold significance in wiki
formatting
- $specialChars = array( "'", "\"", "=", "--", "*", "|" );
- $text = str_replace( $specialChars, "", $text );
-
- //remove the [[]] types of text
+ // remove the [[]] types of text
$pattern = "/\[\[.*?\]\]|{{.*?}}/";
- $text = preg_replace( $pattern, "", $text );
+ $text = preg_replace( $pattern, '', $text );
- //remove links
- $pattern =
"/\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|$!:,.;]*[A-Z0-9+&@#\/%=~_|$]/i" ;
- $text = preg_replace( $pattern, "", $text );
+ // remove links
+ $pattern =
"/\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|$!:,.;]*[A-Z0-9+&@#\/%=~_|$]/i";
+ $text = preg_replace( $pattern, '', $text );
- //remove the other special charachters
- $specialChars = array( "[", "]", "{", "}", ":", "/" , ";" ,
"?", "-", "$", "\\");
- $text = str_replace( $specialChars, "", $text );
+ // remove the other special characters
+ $specialChars = array( '[', ']', '{', '}', ':', '/', ';', '?',
'-', '$', "\\" );
+ $text = str_replace( $specialChars, '', $text );
- /*
- * this is done instead of using a autoloader
- * cause then this script can be called from
- * other places
- */
- if( class_exists('Sanitizer') )
- {
- if( method_exists( 'Sanitizer' ,
'decodeCharReferencesAndNormalize' ) )
- {
+ /**
+ * this is done instead of using a autoloader
+ * cause then this script can be called from other places
+ */
+ if ( class_exists( 'Sanitizer' ) ) {
+ if ( method_exists( 'Sanitizer',
'decodeCharReferencesAndNormalize' ) ) {
$text =
Sanitizer::decodeCharReferencesAndNormalize( $text );
}
}
@@ -46,51 +40,43 @@
return $text;
}
-
/**
- * returns an iterator to the next token in the text;
- */
- public function tokenize($text = null){
-
+ * @param string|null $text
+ * @return string An iterator to the next token in the text;
+ */
+ public function tokenize( $text = null ) {
static $tok = true;
+
$delimiters = " \n\t\r,.";
- if( $tok == false )
- {
+ if ( $tok == false ) {
return null;
- }
- elseif( $text )
- {
+ } elseif ( $text ) {
$tok = strtok( $text, $delimiters );
- }
- else
- {
+ } else {
$tok = strtok( $delimiters );
}
+
return $tok;
}
/**
- * returns if a word is present in stopWords or not
- */
- public function isStopWord( $word ){
-
+ * @param string $word Word to check
+ * @return bool Whether a word is present in stopWords (true) or not
(false)
+ */
+ public function isStopWord( $word ) {
static $stopWordDict = false;
- if( !$stopWordDict )
- {
+ if ( !$stopWordDict ) {
$stopWords = array();
- $handle = fopen( __DIR__ . "/StopWords.txt", "r");
- if( $handle )
- {
- while( ( $buffer = fgets( $handle ) ) != false )
- {
+ $handle = fopen( __DIR__ . '/StopWords.txt', 'r' );
+ if ( $handle ) {
+ while ( ( $buffer = fgets( $handle ) ) != false
) {
$stopWord = trim( $buffer );
- $stopWords[] = str_replace("'", "",
$stopWord);
+ $stopWords[] = str_replace( "'", '',
$stopWord );
}
}
- fclose($handle);
- foreach( $stopWords as $stopWord )
- {
+ fclose( $handle );
+ foreach ( $stopWords as $stopWord ) {
$stopWordDict[$stopWord] = 1;
}
}
@@ -99,16 +85,18 @@
}
/**
- * stems a word to its root
- */
- public function stem( $word ){
- /*
- * this is done instead of using a autoloader
- * cause then this script can be called from
- * other places
- */
- if( !class_exists('PorterStemmer') )
- {
+ * Stems a word to its root
+ *
+ * @param string $word Word to stem
+ * @return string Root word
+ */
+ public function stem( $word ) {
+ /**
+ * this is done instead of using a autoloader
+ * cause then this script can be called from
+ * other places
+ */
+ if ( !class_exists( 'PorterStemmer' ) ) {
require_once( __DIR__ . '/Stemmer.php' );
}
diff --git a/BayesianFilter.php b/BayesianFilter.php
index f41c182..5e59000 100644
--- a/BayesianFilter.php
+++ b/BayesianFilter.php
@@ -1,7 +1,14 @@
<?php
-
-# Loader for bayesian filter feature
-# Include this from LocalSettings.php
+/**
+ * Loader for Bayesian filter feature
+ * Include this from LocalSettings.php
+ *
+ * @file
+ * @ingroup Extensions
+ * @version 0.1
+ * @author Anbhav Agarwal
+ * @link https://www.mediawiki.org/wiki/Extension:BayesianFilter Documentation
+ */
if ( !defined( 'MEDIAWIKI' ) ) {
exit;
@@ -10,14 +17,14 @@
$wgExtensionCredits['antispam'][] = array(
'path' => __FILE__,
'name' => 'BayesianFilter',
- 'author' => array( 'Anbhav Agarwal'),
+ 'author' => array( 'Anbhav Agarwal' ),
+ 'version' => '0.1',
'url' =>
'https://www.mediawiki.org/wiki/Extension:BayesianFilter',
'descriptionmsg' => 'bayesianfilter-desc',
);
-$dir = __DIR__ . '/';
$wgMessagesDirs['BayesianFilter'] = __DIR__ . '/i18n';
-$wgExtensionMessagesFiles['BayesianFilter'] = $dir . 'BayesianFilter.i18n.php';
+$wgExtensionMessagesFiles['BayesianFilter'] = __DIR__ .
'/BayesianFilter.i18n.php';
/**
* Array of settings for filter classes
@@ -32,9 +39,9 @@
$wgHooks['LoadExtensionSchemaUpdates'][] =
'BayesianFilterHooks::onLoadExtensionSchemaUpdates';
-$wgAutoloadClasses['BayesianFilterHooks'] = $dir . 'BayesianFilter.Hooks.php';
-$wgAutoloadClasses['BayesianFilterPageView'] = $dir .
'BayesianFilter.PageView.php';
-$wgAutoloadClasses['BayesianFilterDBHandler'] = $dir .
'BayesianFilter.DBHandler.php';
-$wgAutoloadClasses['BayesianFilter'] = $dir . 'BayesianFilter.Body.php';
-$wgAutoloadClasses['BayesianFilterTokenizer'] = $dir .
'BayesianFilter.Tokenizer.php';
-$wgAutoloadClasses['PorterStemmer'] = $dir . 'Stemmer.php';
+$wgAutoloadClasses['BayesianFilterHooks'] = __DIR__ .
'/BayesianFilter.Hooks.php';
+$wgAutoloadClasses['BayesianFilterPageView'] = __DIR__ .
'/BayesianFilter.PageView.php';
+$wgAutoloadClasses['BayesianFilterDBHandler'] = __DIR__ .
'/BayesianFilter.DBHandler.php';
+$wgAutoloadClasses['BayesianFilter'] = __DIR__ . '/BayesianFilter.Body.php';
+$wgAutoloadClasses['BayesianFilterTokenizer'] = __DIR__ .
'/BayesianFilter.Tokenizer.php';
+$wgAutoloadClasses['PorterStemmer'] = __DIR__ . '/Stemmer.php';
diff --git a/i18n/en.json b/i18n/en.json
index 1792d1b..e18b94d 100644
--- a/i18n/en.json
+++ b/i18n/en.json
@@ -4,7 +4,8 @@
"Anbhav Agarwal"
]
},
+ "bayesianfilter-content-is-spam": "Sorry, the content on this page
triggered the spam filter. It cannot be saved.",
"bayesianfilter-desc": "Filters wikitext into spam and hams using Bayesian
techniques",
"bayesianfilter-flag-spam-check-title": "Mark this checkbox if you think
that the edit that you are undoing is a spam edit",
"bayesianfilter-flag-spam-check": "Mark this as spam"
-}
\ No newline at end of file
+}
diff --git a/i18n/qqq.json b/i18n/qqq.json
index cc45a1e..e4fdf98 100644
--- a/i18n/qqq.json
+++ b/i18n/qqq.json
@@ -1,10 +1,12 @@
{
"@metadata": {
"authors": [
+ "Lewis Cawte",
"Raimond Spekking",
"Shirayuki"
]
},
+ "bayesianfilter-content-is-spam": "Message displayed to the user to
tell them the edit could not be saved as the filter identified it as spam",
"bayesianfilter-desc": "{{desc|name=Bayesian
Filter|url=https://www.mediawiki.org/wiki/Extension:BayesianFilter}}\n\nAbout
\"bayesian\", see [[w:Bayesian spam filtering]].\n\n\"ham(s)\" means \"not
spam\".",
"bayesianfilter-flag-spam-check-title": "Title of a checkbox.\n\nSee
also:\n* {{msg-mw|Bayesianfilter-flag-spam-check}}",
"bayesianfilter-flag-spam-check": "Label of a checkbox.\n\nSee also:\n*
{{msg-mw|Bayesianfilter-flag-spam-check-title}}"
--
To view, visit https://gerrit.wikimedia.org/r/161785
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I2d7255fb3c44853c45c2ed5b3cee8a2e7b16f897
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/extensions/BayesianFilter
Gerrit-Branch: master
Gerrit-Owner: Jack Phoenix <[email protected]>
Gerrit-Reviewer: Legoktm <[email protected]>
Gerrit-Reviewer: Lewis Cawte <[email protected]>
Gerrit-Reviewer: Siebrand <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits