Ori.livneh has uploaded a new change for review.
https://gerrit.wikimedia.org/r/233847
Change subject: Move rule tokenization to new AbuseFilterTokenizer class
......................................................................
Move rule tokenization to new AbuseFilterTokenizer class
* Move AbuseFilterParser::nextToken() and the various AbuseFilterParser
properties that accompanied it to a new class, AbuseFilterTokenizer.
* Tokenize rules eagerly and cache the result in APC.
Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d
(cherry picked from commit 856f59a0bcd35e3f207b823a9a9d14e134507efb)
---
M AbuseFilter.parser.php
M AbuseFilter.php
A AbuseFilterTokenizer.php
M tests/phpunit/parserTest.php
4 files changed, 250 insertions(+), 208 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/AbuseFilter
refs/changes/47/233847/1
diff --git a/AbuseFilter.parser.php b/AbuseFilter.parser.php
index 75cb508..85b5030 100644
--- a/AbuseFilter.parser.php
+++ b/AbuseFilter.parser.php
@@ -591,21 +591,6 @@
class AbuseFilterParser {
public $mParams, $mCode, $mTokens, $mPos, $mCur, $mShortCircuit,
$mAllowShort, $mLen;
- const COMMENT_START_RE = '/\s*\/\*/A';
- const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';
- const OPERATOR_RE =
'/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';
- const RADIX_RE = '/([0-9A-Fa-f]+(?:\.\d*)?|\.\d+)([bxo])?/Au';
- const WHITESPACE = "\011\012\013\014\015\040";
-
- static $mPunctuation = array(
- ',' => AFPToken::TComma,
- '(' => AFPToken::TBrace,
- ')' => AFPToken::TBrace,
- '[' => AFPToken::TSquareBracket,
- ']' => AFPToken::TSquareBracket,
- ';' => AFPToken::TStatementSeparator,
- );
-
/**
* @var AbuseFilterVariableHolder
*/
@@ -642,39 +627,6 @@
// Functions that affect parser state, and shouldn't be cached.
static $ActiveFunctions = array(
'funcSetVar',
- );
-
- // Order is important. The punctuation-matching regex requires that
- // ** comes before *, etc. They are sorted to make it easy to spot
- // such errors.
- static $mOps = array(
- '!==', '!=', '!', // Inequality
- '**', '*', // Multiplication/exponentiation
- '/', '+', '-', '%', // Other arithmetic
- '&', '|', '^', // Logic
- ':=', // Setting
- '?', ':', // Ternery
- '<=', '<', // Less than
- '>=', '>', // Greater than
- '===', '==', '=', // Equality
- );
-
- static $mBases = array(
- 'b' => 2,
- 'x' => 16,
- 'o' => 8
- );
-
- static $mBaseCharsRegEx = array(
- 2 => '/^[01]+$/',
- 8 => '/^[0-8]+$/',
- 16 => '/^[0-9A-Fa-f]+$/',
- 10 => '/^[0-9.]+$/',
- );
-
- static $mKeywords = array(
- 'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
- 'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
);
static $funcCache = array();
@@ -743,12 +695,7 @@
* @return AFPToken
*/
protected function move() {
- list( $val, $type, $offset ) = self::nextToken( $this->mCode,
$this->mPos );
-
- $token = new AFPToken( $type, $val, $this->mPos );
- $this->mPos = $offset;
-
- return $this->mCur = $token;
+ list( $this->mCur, $this->mPos ) = $this->mTokens[$this->mPos];
}
/**
@@ -815,6 +762,7 @@
function intEval( $code ) {
// Setup, resetting
$this->mCode = $code;
+ $this->mTokens = AbuseFilterTokenizer::tokenize( $code );
$this->mPos = 0;
$this->mLen = strlen( $code );
$this->mShortCircuit = false;
@@ -1443,153 +1391,7 @@
$this->mVars->setVar( $name, $value );
}
- /**
- * @param $code
- * @param $offset
- * @return array
- * @throws AFPException
- * @throws AFPUserVisibleException
- */
- static function readStringLiteral( $code, $offset ) {
- $type = $code[$offset];
- $offset++;
- $length = strlen( $code );
- $token = '';
- while ( $offset < $length ) {
- if ( $code[$offset] === $type ) {
- $offset++;
- return array( $token, AFPToken::TString,
$offset );
- }
- // Performance: Use a PHP function (implemented in C)
- // to scan ahead.
- $addLength = strcspn( $code, $type . "\\", $offset );
- if ( $addLength ) {
- $token .= substr( $code, $offset, $addLength );
- $offset += $addLength;
- } elseif ( $code[$offset] == '\\' ) {
- switch( $code[$offset + 1] ) {
- case '\\':
- $token .= '\\';
- break;
- case $type:
- $token .= $type;
- break;
- case 'n';
- $token .= "\n";
- break;
- case 'r':
- $token .= "\r";
- break;
- case 't':
- $token .= "\t";
- break;
- case 'x':
- $chr = substr( $code, $offset +
2, 2 );
-
- if ( preg_match(
'/^[0-9A-Fa-f]{2}$/', $chr ) ) {
- $chr = base_convert(
$chr, 16, 10 );
- $token .= chr( $chr );
- $offset += 2; # \xXX --
2 done later
- } else {
- $token .= 'x';
- }
- break;
- default:
- $token .= "\\" . $code[$offset
+ 1];
- }
-
- $offset += 2;
-
- } else {
- $token .= $code[$offset];
- $offset++;
- }
- }
- throw new AFPUserVisibleException( 'unclosedstring', $offset,
array() );
- }
-
- /**
- * @param $code
- * @param $offset
- * @return array
- * @throws AFPException
- * @throws AFPUserVisibleException
- */
- static function nextToken( $code, $offset ) {
- $matches = array();
-
- // Read past comments
- while ( preg_match( '/\s*\/\*/A', $code, $matches, 0, $offset )
) {
- $offset = strpos( $code, '*/', $offset ) + 2;
- }
-
- // Spaces
- $offset += strspn( $code, self::WHITESPACE, $offset );
- if ( $offset >= strlen( $code ) ) {
- return array( '', AFPToken::TNone, $offset );
- }
-
- $chr = $code[$offset];
-
- // Punctuation
- if ( isset( self::$mPunctuation[$chr] ) ) {
- return array( $chr, self::$mPunctuation[$chr], $offset
+ 1 );
- }
-
- // String literal
- if ( $chr === '"' || $chr === "'" ) {
- return self::readStringLiteral( $code, $offset );
- }
-
- $matches = array();
-
- // Operators
- if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset
) ) {
- $token = $matches[0];
- return array( $token, AFPToken::TOp, $offset + strlen(
$token ) );
- }
-
- // Numbers
- if ( preg_match( self::RADIX_RE, $code, $matches, 0, $offset )
) {
- $token = $matches[0];
- $input = $matches[1];
- $baseChar = @$matches[2];
- // Sometimes the base char gets mixed in with the rest
of it because
- // the regex targets hex, too.
- // This mostly happens with binary
- if ( !$baseChar && !empty( self::$mBases[ substr(
$input, - 1 ) ] ) ) {
- $baseChar = substr( $input, - 1, 1 );
- $input = substr( $input, 0, - 1 );
- }
-
- $base = $baseChar ? self::$mBases[$baseChar] : 10;
-
- // Check against the appropriate character class for
input validation
-
- if ( preg_match( self::$mBaseCharsRegEx[$base], $input
) ) {
- $num = $base !== 10 ? base_convert( $input,
$base, 10 ) : $input;
- $offset += strlen( $token );
- return ( strpos( $input, '.' ) !== false )
- ? array( floatval( $num ),
AFPToken::TFloat, $offset )
- : array( intval( $num ),
AFPToken::TInt, $offset );
- }
- }
-
- // IDs / Keywords
-
- if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0,
$offset ) ) {
- $token = $matches[0];
- $offset += strlen( $token );
- $type = in_array( $token, self::$mKeywords )
- ? AFPToken::TKeyword
- : AFPToken::TID;
- return array( $token, $type, $offset );
- }
-
- throw new AFPUserVisibleException(
- 'unrecognisedtoken', $offset, array( substr( $code,
$offset ) ) );
- }
// Built-in functions
diff --git a/AbuseFilter.php b/AbuseFilter.php
index 7508ef5..c0fce42 100644
--- a/AbuseFilter.php
+++ b/AbuseFilter.php
@@ -30,6 +30,7 @@
$wgAutoloadClasses['AbuseFilter'] = "$dir/AbuseFilter.class.php";
$wgAutoloadClasses['AbuseFilterParser'] = "$dir/AbuseFilter.parser.php";
+$wgAutoloadClasses['AbuseFilterTokenizer'] = "$dir/AbuseFilterTokenizer.php";
$wgAutoloadClasses['AbuseFilterHooks'] = "$dir/AbuseFilter.hooks.php";
$wgAutoloadClasses['SpecialAbuseLog'] = "$dir/special/SpecialAbuseLog.php";
$wgAutoloadClasses['AbuseLogPager'] = "$dir/special/SpecialAbuseLog.php";
diff --git a/AbuseFilterTokenizer.php b/AbuseFilterTokenizer.php
new file mode 100644
index 0000000..aff8509
--- /dev/null
+++ b/AbuseFilterTokenizer.php
@@ -0,0 +1,239 @@
+<?php
+/**
+ * Tokenizer for AbuseFilter rules.
+ */
+class AbuseFilterTokenizer {
+
+ /** @var int Tokenizer cache version. Increment this when changing the
syntax. **/
+ const CACHE_VERSION = 1;
+ const COMMENT_START_RE = '/\s*\/\*/A';
+ const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';
+ const OPERATOR_RE =
'/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';
+ const RADIX_RE = '/([0-9A-Fa-f]+(?:\.\d*)?|\.\d+)([bxo])?/Au';
+ const WHITESPACE = "\011\012\013\014\015\040";
+
+ // Order is important. The operator-matching regex requires that
+ // ** comes before *, etc. They are sorted to make it easy to spot
+ // such errors.
+ static $operators = array(
+ '!==', '!=', '!', // Inequality
+ '**', '*', // Multiplication/exponentiation
+ '/', '+', '-', '%', // Other arithmetic
+ '&', '|', '^', // Logic
+ ':=', // Setting
+ '?', ':', // Ternary
+ '<=', '<', // Less than
+ '>=', '>', // Greater than
+ '===', '==', '=', // Equality
+ );
+
+ static $punctuation = array(
+ ',' => AFPToken::TComma,
+ '(' => AFPToken::TBrace,
+ ')' => AFPToken::TBrace,
+ '[' => AFPToken::TSquareBracket,
+ ']' => AFPToken::TSquareBracket,
+ ';' => AFPToken::TStatementSeparator,
+ );
+
+ static $bases = array(
+ 'b' => 2,
+ 'x' => 16,
+ 'o' => 8
+ );
+
+ static $baseCharsRe = array(
+ 2 => '/^[01]+$/',
+ 8 => '/^[0-8]+$/',
+ 16 => '/^[0-9A-Fa-f]+$/',
+ 10 => '/^[0-9.]+$/',
+ );
+
+ static $keywords = array(
+ 'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
+ 'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
+ );
+
+ /**
+ * @param $code
+ * @return array
+ * @throws AFPException
+ * @throws AFPUserVisibleException
+ */
+ static function tokenize( $code ) {
+ static $tokenizerCache = null;
+
+ if ( !$tokenizerCache ) {
+ $tokenizerCache = ObjectCache::newAccelerator( array(),
'hash' );
+ }
+
+ $cacheKey = wfGlobalCacheKey( __CLASS__, self::CACHE_VERSION,
crc32( $code ) );
+ $tokens = $tokenizerCache->get( $cacheKey );
+
+ if ( !$tokens ) {
+ $tokens = array();
+ $curPos = 0;
+
+ do {
+ $prevPos = $curPos;
+ $token = self::nextToken( $code, $curPos );
+ $tokens[ $token->pos ] = array( $token, $curPos
);
+ } while ( $curPos !== $prevPos );
+
+ $tokenizerCache->set( $cacheKey, $tokens, 600 );
+ }
+
+ return $tokens;
+ }
+
+ /**
+ * @param $code
+ * @param $offset
+ * @return AFPToken
+ * @throws AFPException
+ * @throws AFPUserVisibleException
+ */
+ protected static function nextToken( $code, &$offset ) {
+ $matches = array();
+ $start = $offset;
+
+ // Read past comments
+ while ( preg_match( self::COMMENT_START_RE, $code, $matches, 0,
$offset ) ) {
+ $offset = strpos( $code, '*/', $offset ) + 2;
+ }
+
+ // Spaces
+ $offset += strspn( $code, self::WHITESPACE, $offset );
+ if ( $offset >= strlen( $code ) ) {
+ return new AFPToken( AFPToken::TNone, '', $start );
+ }
+
+ $chr = $code[$offset];
+
+ // Punctuation
+ if ( isset( self::$punctuation[$chr] ) ) {
+ $offset++;
+ return new AFPToken( self::$punctuation[$chr], $chr,
$start );
+ }
+
+ // String literal
+ if ( $chr === '"' || $chr === "'" ) {
+ return self::readStringLiteral( $code, $offset, $start
);
+ }
+
+ $matches = array();
+
+ // Operators
+ if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset
) ) {
+ $token = $matches[0];
+ $offset += strlen( $token );
+ return new AFPToken( AFPToken::TOp, $token, $start );
+ }
+
+ // Numbers
+ if ( preg_match( self::RADIX_RE, $code, $matches, 0, $offset )
) {
+ $token = $matches[0];
+ $input = $matches[1];
+ $baseChar = @$matches[2];
+ // Sometimes the base char gets mixed in with the rest
of it because
+ // the regex targets hex, too.
+ // This mostly happens with binary
+ if ( !$baseChar && !empty( self::$bases[ substr(
$input, - 1 ) ] ) ) {
+ $baseChar = substr( $input, - 1, 1 );
+ $input = substr( $input, 0, - 1 );
+ }
+
+ $base = $baseChar ? self::$bases[$baseChar] : 10;
+
+ // Check against the appropriate character class for
input validation
+
+ if ( preg_match( self::$baseCharsRe[$base], $input ) ) {
+ $num = $base !== 10 ? base_convert( $input,
$base, 10 ) : $input;
+ $offset += strlen( $token );
+ return ( strpos( $input, '.' ) !== false )
+ ? new AFPToken( AFPToken::TFloat,
floatval( $num ), $start )
+ : new AFPToken( AFPToken::TInt, intval(
$num ), $start );
+ }
+ }
+
+ // IDs / Keywords
+
+ if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0,
$offset ) ) {
+ $token = $matches[0];
+ $offset += strlen( $token );
+ $type = in_array( $token, self::$keywords )
+ ? AFPToken::TKeyword
+ : AFPToken::TID;
+ return new AFPToken( $type, $token, $start );
+ }
+
+ throw new AFPUserVisibleException(
+ 'unrecognisedtoken', $start, array( substr( $code,
$start ) ) );
+ }
+
+ /**
+ * @param $code
+ * @param $offset
+ * @return AFPToken
+ * @throws AFPException
+ * @throws AFPUserVisibleException
+ */
+ protected static function readStringLiteral( $code, &$offset, $start ) {
+ $type = $code[$offset];
+ $offset++;
+ $length = strlen( $code );
+ $token = '';
+ while ( $offset < $length ) {
+ if ( $code[$offset] === $type ) {
+ $offset++;
+ return new AFPToken( AFPToken::TString, $token,
$start );
+ }
+
+ // Performance: Use a PHP function (implemented in C)
+ // to scan ahead.
+ $addLength = strcspn( $code, $type . "\\", $offset );
+ if ( $addLength ) {
+ $token .= substr( $code, $offset, $addLength );
+ $offset += $addLength;
+ } elseif ( $code[$offset] == '\\' ) {
+ switch( $code[$offset + 1] ) {
+ case '\\':
+ $token .= '\\';
+ break;
+ case $type:
+ $token .= $type;
+ break;
+ case 'n';
+ $token .= "\n";
+ break;
+ case 'r':
+ $token .= "\r";
+ break;
+ case 't':
+ $token .= "\t";
+ break;
+ case 'x':
+ $chr = substr( $code, $offset +
2, 2 );
+
+ if ( preg_match(
'/^[0-9A-Fa-f]{2}$/', $chr ) ) {
+ $chr = base_convert(
$chr, 16, 10 );
+ $token .= chr( $chr );
+ $offset += 2; # \xXX --
2 done later
+ } else {
+ $token .= 'x';
+ }
+ break;
+ default:
+ $token .= "\\" . $code[$offset
+ 1];
+ }
+
+ $offset += 2;
+
+ } else {
+ $token .= $code[$offset];
+ $offset++;
+ }
+ }
+ throw new AFPUserVisibleException( 'unclosedstring', $offset,
array() );
+ }
+}
diff --git a/tests/phpunit/parserTest.php b/tests/phpunit/parserTest.php
index 20f1767..34c6c9c 100644
--- a/tests/phpunit/parserTest.php
+++ b/tests/phpunit/parserTest.php
@@ -77,23 +77,23 @@
}
/**
- * Ensure that AbsueFilterParser::OPERATOR_RE matches the contents
- * and order of AbuseFilterParser::$mOps.
+ * Ensure that AbuseFilterTokenizer::OPERATOR_RE matches the contents
+ * and order of AbuseFilterTokenizer::$operators.
*/
public function testOperatorRe() {
$operatorRe = '/(' . implode( '|', array_map( function ( $op ) {
return preg_quote( $op, '/' );
- }, AbuseFilterParser::$mOps ) ) . ')/A';
- $this->assertEquals( $operatorRe,
AbuseFilterParser::OPERATOR_RE );
+ }, AbuseFilterTokenizer::$operators ) ) . ')/A';
+ $this->assertEquals( $operatorRe,
AbuseFilterTokenizer::OPERATOR_RE );
}
/**
- * Ensure that AbsueFilterParser::RADIX_RE matches the contents
- * and order of AbuseFilterParser::$mBases.
+ * Ensure that AbuseFilterTokenizer::RADIX_RE matches the contents
+ * and order of AbuseFilterTokenizer::$bases.
*/
public function testRadixRe() {
- $baseClass = implode( '', array_keys(
AbuseFilterParser::$mBases ) );
+ $baseClass = implode( '', array_keys(
AbuseFilterTokenizer::$bases ) );
$radixRe = "/([0-9A-Fa-f]+(?:\.\d*)?|\.\d+)([$baseClass])?/Au";
- $this->assertEquals( $radixRe, AbuseFilterParser::RADIX_RE );
+ $this->assertEquals( $radixRe, AbuseFilterTokenizer::RADIX_RE );
}
}
--
To view, visit https://gerrit.wikimedia.org/r/233847
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/AbuseFilter
Gerrit-Branch: wmf/1.26wmf19
Gerrit-Owner: Ori.livneh <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits