[MediaWiki-commits] [Gerrit] Move rule tokenization to new AbuseFilterTokenizer class - change (mediawiki...AbuseFilter)

Ori.livneh (Code Review) Tue, 25 Aug 2015 14:06:07 -0700

Ori.livneh has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/233844


Change subject: Move rule tokenization to new AbuseFilterTokenizer class
......................................................................

Move rule tokenization to new AbuseFilterTokenizer class

* Move AbuseFilterParser::nextToken() and the various AbuseFilterParser
  properties that accompanied it to a new class, AbuseFilterTokenizer.
* Tokenize rules eagerly and cache the result in APC.

Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d
---
M AbuseFilter.parser.php
M AbuseFilter.php
A AbuseFilterTokenizer.php
M tests/phpunit/parserTest.php
4 files changed, 250 insertions(+), 208 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/AbuseFilter 
refs/changes/44/233844/1

diff --git a/AbuseFilter.parser.php b/AbuseFilter.parser.php
index f4d5cfc..447166f 100644
--- a/AbuseFilter.parser.php
+++ b/AbuseFilter.parser.php
@@ -591,21 +591,6 @@
 class AbuseFilterParser {
        public $mCode, $mTokens, $mPos, $mCur, $mShortCircuit, $mAllowShort, 
$mLen;
 
-       const COMMENT_START_RE = '/\s*\/\*/A';
-       const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';
-       const OPERATOR_RE = 
'/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';
-       const RADIX_RE = '/([0-9A-Fa-f]+(?:\.\d*)?|\.\d+)([bxo])?/Au';
-       const WHITESPACE = "\011\012\013\014\015\040";
-
-       static $mPunctuation = array(
-               ',' => AFPToken::TComma,
-               '(' => AFPToken::TBrace,
-               ')' => AFPToken::TBrace,
-               '[' => AFPToken::TSquareBracket,
-               ']' => AFPToken::TSquareBracket,
-               ';' => AFPToken::TStatementSeparator,
-       );
-
        /**
         * @var AbuseFilterVariableHolder
         */
@@ -642,39 +627,6 @@
        // Functions that affect parser state, and shouldn't be cached.
        static $ActiveFunctions = array(
                'funcSetVar',
-       );
-
-       // Order is important. The punctuation-matching regex requires that
-       //  ** comes before *, etc. They are sorted to make it easy to spot
-       //  such errors.
-       static $mOps = array(
-               '!==', '!=', '!',       // Inequality
-               '**', '*',                      // Multiplication/exponentiation
-               '/', '+', '-', '%', // Other arithmetic
-               '&', '|', '^',          // Logic
-               ':=',                           // Setting
-               '?', ':',                       // Ternery
-               '<=', '<',                      // Less than
-               '>=', '>',                      // Greater than
-               '===', '==', '=',       // Equality
-       );
-
-       static $mBases = array(
-               'b' => 2,
-               'x' => 16,
-               'o' => 8
-       );
-
-       static $mBaseCharsRegEx = array(
-               2  => '/^[01]+$/',
-               8  => '/^[0-8]+$/',
-               16 => '/^[0-9A-Fa-f]+$/',
-               10 => '/^[0-9.]+$/',
-       );
-
-       static $mKeywords = array(
-               'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
-               'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
        );
 
        static $funcCache = array();
@@ -742,12 +694,7 @@
         * @return AFPToken
         */
        protected function move() {
-               list( $val, $type, $offset ) = self::nextToken( $this->mCode, 
$this->mPos );
-
-               $token = new AFPToken( $type, $val, $this->mPos );
-               $this->mPos = $offset;
-
-               return $this->mCur = $token;
+               list( $this->mCur, $this->mPos ) = $this->mTokens[$this->mPos];
        }
 
        /**
@@ -814,6 +761,7 @@
        function intEval( $code ) {
                // Setup, resetting
                $this->mCode = $code;
+               $this->mTokens = AbuseFilterTokenizer::tokenize( $code );
                $this->mPos = 0;
                $this->mLen = strlen( $code );
                $this->mShortCircuit = false;
@@ -1442,153 +1390,7 @@
                $this->mVars->setVar( $name, $value );
        }
 
-       /**
-        * @param $code
-        * @param $offset
-        * @return array
-        * @throws AFPException
-        * @throws AFPUserVisibleException
-        */
-       static function readStringLiteral( $code, $offset ) {
-               $type = $code[$offset];
-               $offset++;
-               $length = strlen( $code );
-               $token = '';
-               while ( $offset < $length ) {
-                       if ( $code[$offset] === $type ) {
-                               $offset++;
-                               return array( $token, AFPToken::TString, 
$offset );
-                       }
 
-                       // Performance: Use a PHP function (implemented in C)
-                       // to scan ahead.
-                       $addLength = strcspn( $code, $type . "\\", $offset );
-                       if ( $addLength ) {
-                               $token .= substr( $code, $offset, $addLength );
-                               $offset += $addLength;
-                       } elseif ( $code[$offset] == '\\' ) {
-                               switch( $code[$offset + 1] ) {
-                                       case '\\':
-                                               $token .= '\\';
-                                               break;
-                                       case $type:
-                                               $token .= $type;
-                                               break;
-                                       case 'n';
-                                               $token .= "\n";
-                                               break;
-                                       case 'r':
-                                               $token .= "\r";
-                                               break;
-                                       case 't':
-                                               $token .= "\t";
-                                               break;
-                                       case 'x':
-                                               $chr = substr( $code, $offset + 
2, 2 );
-
-                                               if ( preg_match( 
'/^[0-9A-Fa-f]{2}$/', $chr ) ) {
-                                                       $chr = base_convert( 
$chr, 16, 10 );
-                                                       $token .= chr( $chr );
-                                                       $offset += 2; # \xXX -- 
2 done later
-                                               } else {
-                                                       $token .= 'x';
-                                               }
-                                               break;
-                                       default:
-                                               $token .= "\\" . $code[$offset 
+ 1];
-                               }
-
-                               $offset += 2;
-
-                       } else {
-                               $token .= $code[$offset];
-                               $offset++;
-                       }
-               }
-               throw new AFPUserVisibleException( 'unclosedstring', $offset, 
array() );
-       }
-
-       /**
-        * @param $code
-        * @param $offset
-        * @return array
-        * @throws AFPException
-        * @throws AFPUserVisibleException
-        */
-       static function nextToken( $code, $offset ) {
-               $matches = array();
-
-               // Read past comments
-               while ( preg_match( '/\s*\/\*/A', $code, $matches, 0, $offset ) 
) {
-                       $offset = strpos( $code, '*/', $offset ) + 2;
-               }
-
-               // Spaces
-               $offset += strspn( $code, self::WHITESPACE, $offset );
-               if ( $offset >= strlen( $code ) ) {
-                       return array( '', AFPToken::TNone, $offset );
-               }
-
-               $chr = $code[$offset];
-
-               // Punctuation
-               if ( isset( self::$mPunctuation[$chr] ) ) {
-                       return array( $chr, self::$mPunctuation[$chr], $offset 
+ 1 );
-               }
-
-               // String literal
-               if ( $chr === '"' || $chr === "'" ) {
-                       return self::readStringLiteral( $code, $offset );
-               }
-
-               $matches = array();
-
-               // Operators
-               if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset 
) ) {
-                       $token = $matches[0];
-                       return array( $token, AFPToken::TOp, $offset + strlen( 
$token ) );
-               }
-
-               // Numbers
-               if ( preg_match( self::RADIX_RE, $code, $matches, 0, $offset ) 
) {
-                       $token = $matches[0];
-                       $input = $matches[1];
-                       $baseChar = @$matches[2];
-                       // Sometimes the base char gets mixed in with the rest 
of it because
-                       // the regex targets hex, too.
-                       // This mostly happens with binary
-                       if ( !$baseChar && !empty( self::$mBases[ substr( 
$input, - 1 ) ] ) ) {
-                               $baseChar = substr( $input, - 1, 1 );
-                               $input = substr( $input, 0, - 1 );
-                       }
-
-                       $base = $baseChar ? self::$mBases[$baseChar] : 10;
-
-                       // Check against the appropriate character class for 
input validation
-
-                       if ( preg_match( self::$mBaseCharsRegEx[$base], $input 
) ) {
-                               $num = $base !== 10 ? base_convert( $input, 
$base, 10 ) : $input;
-                               $offset += strlen( $token );
-                               return ( strpos( $input, '.' ) !== false )
-                                       ? array( floatval( $num ), 
AFPToken::TFloat, $offset )
-                                       : array( intval( $num ), 
AFPToken::TInt, $offset );
-                       }
-               }
-
-               // IDs / Keywords
-
-               if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0, 
$offset ) ) {
-                       $token = $matches[0];
-                       $offset += strlen( $token );
-                       $type = in_array( $token, self::$mKeywords )
-                               ? AFPToken::TKeyword
-                               : AFPToken::TID;
-                       return array( $token, $type, $offset );
-               }
-
-               throw new AFPUserVisibleException(
-                       'unrecognisedtoken', $offset, array( substr( $code, 
$offset ) ) );
-       }
 
        // Built-in functions
 
diff --git a/AbuseFilter.php b/AbuseFilter.php
index 7508ef5..c0fce42 100644
--- a/AbuseFilter.php
+++ b/AbuseFilter.php
@@ -30,6 +30,7 @@
 
 $wgAutoloadClasses['AbuseFilter'] = "$dir/AbuseFilter.class.php";
 $wgAutoloadClasses['AbuseFilterParser'] = "$dir/AbuseFilter.parser.php";
+$wgAutoloadClasses['AbuseFilterTokenizer'] = "$dir/AbuseFilterTokenizer.php";
 $wgAutoloadClasses['AbuseFilterHooks'] = "$dir/AbuseFilter.hooks.php";
 $wgAutoloadClasses['SpecialAbuseLog'] = "$dir/special/SpecialAbuseLog.php";
 $wgAutoloadClasses['AbuseLogPager'] = "$dir/special/SpecialAbuseLog.php";
diff --git a/AbuseFilterTokenizer.php b/AbuseFilterTokenizer.php
new file mode 100644
index 0000000..aff8509
--- /dev/null
+++ b/AbuseFilterTokenizer.php
@@ -0,0 +1,239 @@
+<?php
+/**
+ * Tokenizer for AbuseFilter rules.
+ */
+class AbuseFilterTokenizer {
+
+       /** @var int Tokenizer cache version. Increment this when changing the 
syntax. **/
+       const CACHE_VERSION = 1;
+       const COMMENT_START_RE = '/\s*\/\*/A';
+       const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';
+       const OPERATOR_RE = 
'/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';
+       const RADIX_RE = '/([0-9A-Fa-f]+(?:\.\d*)?|\.\d+)([bxo])?/Au';
+       const WHITESPACE = "\011\012\013\014\015\040";
+
+       // Order is important. The punctuation-matching regex requires that
+       //  ** comes before *, etc. They are sorted to make it easy to spot
+       //  such errors.
+       static $operators = array(
+               '!==', '!=', '!',   // Inequality
+               '**', '*',          // Multiplication/exponentiation
+               '/', '+', '-', '%', // Other arithmetic
+               '&', '|', '^',      // Logic
+               ':=',               // Setting
+               '?', ':',           // Ternary
+               '<=', '<',          // Less than
+               '>=', '>',          // Greater than
+               '===', '==', '=',   // Equality
+       );
+
+       static $punctuation = array(
+               ',' => AFPToken::TComma,
+               '(' => AFPToken::TBrace,
+               ')' => AFPToken::TBrace,
+               '[' => AFPToken::TSquareBracket,
+               ']' => AFPToken::TSquareBracket,
+               ';' => AFPToken::TStatementSeparator,
+       );
+
+       static $bases = array(
+               'b' => 2,
+               'x' => 16,
+               'o' => 8
+       );
+
+       static $baseCharsRe = array(
+               2  => '/^[01]+$/',
+               8  => '/^[0-8]+$/',
+               16 => '/^[0-9A-Fa-f]+$/',
+               10 => '/^[0-9.]+$/',
+       );
+
+       static $keywords = array(
+               'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
+               'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
+       );
+
+       /**
+        * @param $code
+        * @return array
+        * @throws AFPException
+        * @throws AFPUserVisibleException
+        */
+       static function tokenize( $code ) {
+               static $tokenizerCache = null;
+
+               if ( !$tokenizerCache ) {
+                       $tokenizerCache = ObjectCache::newAccelerator( array(), 
'hash' );
+               }
+
+               $cacheKey = wfGlobalCacheKey( __CLASS__, self::CACHE_VERSION, 
crc32( $code ) );
+               $tokens = $tokenizerCache->get( $cacheKey );
+
+               if ( !$tokens ) {
+                       $tokens = array();
+                       $curPos = 0;
+
+                       do {
+                               $prevPos = $curPos;
+                               $token = self::nextToken( $code, $curPos );
+                               $tokens[ $token->pos ] = array( $token, $curPos 
);
+                       } while ( $curPos !== $prevPos );
+
+                       $tokenizerCache->set( $cacheKey, $tokens, 600 );
+               }
+
+               return $tokens;
+       }
+
+       /**
+        * @param $code
+        * @param $offset
+        * @return AFPToken
+        * @throws AFPException
+        * @throws AFPUserVisibleException
+        */
+       protected static function nextToken( $code, &$offset ) {
+               $matches = array();
+               $start = $offset;
+
+               // Read past comments
+               while ( preg_match( self::COMMENT_START_RE, $code, $matches, 0, 
$offset ) ) {
+                       $offset = strpos( $code, '*/', $offset ) + 2;
+               }
+
+               // Spaces
+               $offset += strspn( $code, self::WHITESPACE, $offset );
+               if ( $offset >= strlen( $code ) ) {
+                       return new AFPToken( AFPToken::TNone, '', $start );
+               }
+
+               $chr = $code[$offset];
+
+               // Punctuation
+               if ( isset( self::$punctuation[$chr] ) ) {
+                       $offset++;
+                       return new AFPToken( self::$punctuation[$chr], $chr, 
$start );
+               }
+
+               // String literal
+               if ( $chr === '"' || $chr === "'" ) {
+                       return self::readStringLiteral( $code, $offset, $start 
);
+               }
+
+               $matches = array();
+
+               // Operators
+               if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset 
) ) {
+                       $token = $matches[0];
+                       $offset += strlen( $token );
+                       return new AFPToken( AFPToken::TOp, $token, $start );
+               }
+
+               // Numbers
+               if ( preg_match( self::RADIX_RE, $code, $matches, 0, $offset ) 
) {
+                       $token = $matches[0];
+                       $input = $matches[1];
+                       $baseChar = @$matches[2];
+                       // Sometimes the base char gets mixed in with the rest 
of it because
+                       // the regex targets hex, too.
+                       // This mostly happens with binary
+                       if ( !$baseChar && !empty( self::$bases[ substr( 
$input, - 1 ) ] ) ) {
+                               $baseChar = substr( $input, - 1, 1 );
+                               $input = substr( $input, 0, - 1 );
+                       }
+
+                       $base = $baseChar ? self::$bases[$baseChar] : 10;
+
+                       // Check against the appropriate character class for 
input validation
+
+                       if ( preg_match( self::$baseCharsRe[$base], $input ) ) {
+                               $num = $base !== 10 ? base_convert( $input, 
$base, 10 ) : $input;
+                               $offset += strlen( $token );
+                               return ( strpos( $input, '.' ) !== false )
+                                       ? new AFPToken( AFPToken::TFloat, 
floatval( $num ), $start )
+                                       : new AFPToken( AFPToken::TInt, intval( 
$num ), $start );
+                       }
+               }
+
+               // IDs / Keywords
+
+               if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0, 
$offset ) ) {
+                       $token = $matches[0];
+                       $offset += strlen( $token );
+                       $type = in_array( $token, self::$keywords )
+                               ? AFPToken::TKeyword
+                               : AFPToken::TID;
+                       return new AFPToken( $type, $token, $start );
+               }
+
+               throw new AFPUserVisibleException(
+                       'unrecognisedtoken', $start, array( substr( $code, 
$start ) ) );
+       }
+
+       /**
+        * @param $code
+        * @param $offset
+        * @return AFPToken
+        * @throws AFPException
+        * @throws AFPUserVisibleException
+        */
+       protected static function readStringLiteral( $code, &$offset, $start ) {
+               $type = $code[$offset];
+               $offset++;
+               $length = strlen( $code );
+               $token = '';
+               while ( $offset < $length ) {
+                       if ( $code[$offset] === $type ) {
+                               $offset++;
+                               return new AFPToken( AFPToken::TString, $token, 
$start );
+                       }
+
+                       // Performance: Use a PHP function (implemented in C)
+                       // to scan ahead.
+                       $addLength = strcspn( $code, $type . "\\", $offset );
+                       if ( $addLength ) {
+                               $token .= substr( $code, $offset, $addLength );
+                               $offset += $addLength;
+                       } elseif ( $code[$offset] == '\\' ) {
+                               switch( $code[$offset + 1] ) {
+                                       case '\\':
+                                               $token .= '\\';
+                                               break;
+                                       case $type:
+                                               $token .= $type;
+                                               break;
+                                       case 'n';
+                                               $token .= "\n";
+                                               break;
+                                       case 'r':
+                                               $token .= "\r";
+                                               break;
+                                       case 't':
+                                               $token .= "\t";
+                                               break;
+                                       case 'x':
+                                               $chr = substr( $code, $offset + 
2, 2 );
+
+                                               if ( preg_match( 
'/^[0-9A-Fa-f]{2}$/', $chr ) ) {
+                                                       $chr = base_convert( 
$chr, 16, 10 );
+                                                       $token .= chr( $chr );
+                                                       $offset += 2; # \xXX -- 
2 done later
+                                               } else {
+                                                       $token .= 'x';
+                                               }
+                                               break;
+                                       default:
+                                               $token .= "\\" . $code[$offset 
+ 1];
+                               }
+
+                               $offset += 2;
+
+                       } else {
+                               $token .= $code[$offset];
+                               $offset++;
+                       }
+               }
+               throw new AFPUserVisibleException( 'unclosedstring', $offset, 
array() );
+       }
+}
diff --git a/tests/phpunit/parserTest.php b/tests/phpunit/parserTest.php
index 20f1767..34c6c9c 100644
--- a/tests/phpunit/parserTest.php
+++ b/tests/phpunit/parserTest.php
@@ -77,23 +77,23 @@
        }
 
        /**
-        * Ensure that AbsueFilterParser::OPERATOR_RE matches the contents
-        * and order of AbuseFilterParser::$mOps.
+        * Ensure that AbuseFilterTokenizer::OPERATOR_RE matches the contents
+        * and order of AbuseFilterTokenizer::$operators.
         */
        public function testOperatorRe() {
                $operatorRe = '/(' . implode( '|', array_map( function ( $op ) {
                        return preg_quote( $op, '/' );
-               }, AbuseFilterParser::$mOps ) ) . ')/A';
-               $this->assertEquals( $operatorRe, 
AbuseFilterParser::OPERATOR_RE );
+               }, AbuseFilterTokenizer::$operators ) ) . ')/A';
+               $this->assertEquals( $operatorRe, 
AbuseFilterTokenizer::OPERATOR_RE );
        }
 
        /**
-        * Ensure that AbsueFilterParser::RADIX_RE matches the contents
-        * and order of AbuseFilterParser::$mBases.
+        * Ensure that AbuseFilterTokenizer::RADIX_RE matches the contents
+        * and order of AbuseFilterTokenizer::$bases.
         */
        public function testRadixRe() {
-               $baseClass = implode( '', array_keys( 
AbuseFilterParser::$mBases ) );
+               $baseClass = implode( '', array_keys( 
AbuseFilterTokenizer::$bases ) );
                $radixRe = "/([0-9A-Fa-f]+(?:\.\d*)?|\.\d+)([$baseClass])?/Au";
-               $this->assertEquals( $radixRe, AbuseFilterParser::RADIX_RE );
+               $this->assertEquals( $radixRe, AbuseFilterTokenizer::RADIX_RE );
        }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/233844
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I15f5b5b65e8c4ec4fba3000d7c9fd78b98967d1d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/AbuseFilter
Gerrit-Branch: wmf/1.26wmf20
Gerrit-Owner: Ori.livneh <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Move rule tokenization to new AbuseFilterTokenizer class - change (mediawiki...AbuseFilter)

Reply via email to