jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/389778 )

Change subject: Add get_matches function
......................................................................


Add get_matches function

Added the get_matches function to store a regex match.

Bug: T179957
Change-Id: I19366ebcaa4d0f007dd675a61c91457dde57f604
---
M i18n/en.json
M i18n/qqq.json
M includes/AbuseFilter.class.php
M includes/parser/AbuseFilterParser.php
M tests/phpunit/parserTest.php
5 files changed, 135 insertions(+), 2 deletions(-)

Approvals:
  jenkins-bot: Verified
  MusikAnimal: Looks good to me, approved



diff --git a/i18n/en.json b/i18n/en.json
index 757e6f6..1a6ccf5 100644
--- a/i18n/en.json
+++ b/i18n/en.json
@@ -231,6 +231,7 @@
        "abusefilter-edit-builder-funcs-norm": "Normalize (norm)",
        "abusefilter-edit-builder-funcs-count": "Number of times string X 
appears in string Y (count)",
        "abusefilter-edit-builder-funcs-rcount": "Number of times regex X 
appears in string Y (rcount)",
+       "abusefilter-edit-builder-funcs-get_matches": "Array of regex matches 
within a text for each capturing group (get_matches)",
        "abusefilter-edit-builder-funcs-rmwhitespace": "Remove whitespace 
(rmwhitespace)",
        "abusefilter-edit-builder-funcs-rmspecials": "Remove special characters 
(rmspecials)",
        "abusefilter-edit-builder-funcs-ip_in_range": "Is IP in range? 
(ip_in_range)",
diff --git a/i18n/qqq.json b/i18n/qqq.json
index 38f17c2..522cc1b 100644
--- a/i18n/qqq.json
+++ b/i18n/qqq.json
@@ -262,6 +262,7 @@
        "abusefilter-edit-builder-funcs-norm": "{{doc-important|Do not 
translate \"'''norm'''\".}} Abuse filter syntax option in a dropdown from the 
group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
        "abusefilter-edit-builder-funcs-count": "{{doc-important|Do not 
translate \"'''count'''\".}} Abuse filter syntax option in a dropdown from the 
group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
        "abusefilter-edit-builder-funcs-rcount": "{{doc-important|Do not 
translate \"rcount\".}}\nAbuse filter syntax option in a dropdown from the 
group {{msg-mw|abusefilter-edit-builder-group-funcs}}.\n\n\"regex\" stands for 
\"regular expression\".",
+       "abusefilter-edit-builder-funcs-get_matches": "{{doc-important|Do not 
translate \"get_matches\"}} Abuse filter syntax option in a dropdown from the 
group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
        "abusefilter-edit-builder-funcs-rmwhitespace": "{{doc-important|Do not 
translate \"rmwhitespace\"}} Abuse filter syntax option in a dropdown from the 
group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
        "abusefilter-edit-builder-funcs-rmspecials": "{{doc-important|Do not 
translate \"'''rmspecials'''\".}} Abuse filter syntax option in a dropdown from 
the group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
        "abusefilter-edit-builder-funcs-ip_in_range": "{{doc-important|Do not 
translate \"'''ip_in_range'''\".}} Abuse filter syntax option in a dropdown 
from the group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
diff --git a/includes/AbuseFilter.class.php b/includes/AbuseFilter.class.php
index 1e936b0..f58a46a 100644
--- a/includes/AbuseFilter.class.php
+++ b/includes/AbuseFilter.class.php
@@ -76,6 +76,7 @@
                        'norm(string)' => 'norm',
                        'count(needle,haystack)' => 'count',
                        'rcount(needle,haystack)' => 'rcount',
+                       'get_matches(needle,haystack)' => 'get_matches',
                        'rmwhitespace(text)' => 'rmwhitespace',
                        'rmspecials(text)' => 'rmspecials',
                        'ip_in_range(ip, range)' => 'ip_in_range',
diff --git a/includes/parser/AbuseFilterParser.php 
b/includes/parser/AbuseFilterParser.php
index 1eb70f7..89f5a62 100644
--- a/includes/parser/AbuseFilterParser.php
+++ b/includes/parser/AbuseFilterParser.php
@@ -10,7 +10,7 @@
         */
        public $mVars;
 
-       // 
length,lcase,ucase,ccnorm,rmdoubles,specialratio,rmspecials,norm,count
+       // 
length,lcase,ucase,ccnorm,rmdoubles,specialratio,rmspecials,norm,count,get_matches
        public static $mFunctions = [
                'lcase' => 'funcLc',
                'ucase' => 'funcUc',
@@ -28,6 +28,7 @@
                'rmwhitespace' => 'funcRMWhitespace',
                'count' => 'funcCount',
                'rcount' => 'funcRCount',
+               'get_matches' => 'funcGetMatches',
                'ip_in_range' => 'funcIPInRange',
                'contains_any' => 'funcContainsAny',
                'substr' => 'funcSubstr',
@@ -193,7 +194,7 @@
         * @param string $code
         * @return AFPData
         */
-       function intEval( $code ) {
+       public function intEval( $code ) {
                // Setup, resetting
                $this->mCode = $code;
                $this->mTokens = AbuseFilterTokenizer::tokenize( $code );
@@ -1025,6 +1026,59 @@
        }
 
        /**
+        * Returns an array of matches of needle in the haystack, the first one 
for the whole regex,
+        * the other ones for every capturing group.
+        *
+        * @param array $args
+        * @return AFPData A list of matches.
+        * @throws AFPUserVisibleException
+        */
+       protected function funcGetMatches( $args ) {
+               if ( count( $args ) < 2 ) {
+                       throw new AFPUserVisibleException(
+                               'notenoughargs',
+                               $this->mCur->pos,
+                               [ 'get_matches', 2, count( $args ) ]
+                       );
+               }
+               $needle = $args[0]->toString();
+               $haystack = $args[1]->toString();
+
+               // Count the number of capturing groups in the submitted 
pattern.
+               // This way we can return a fixed-dimension array, much easier 
to manage.
+               // First, strip away escaped parentheses
+               $sanitized = preg_replace( '/(\\\\\\\\)*\\\\\(/', '', $needle );
+               // Then strip starting parentheses of non-capturing groups
+               // (also atomics, lookahead and so on, even if not all of 
them are supported)
+               $sanitized = preg_replace( '/\(\?/', '', $sanitized );
+               // Finally create an array of falses with dimension = # of 
capturing groups
+               $groupscount = substr_count( $sanitized, '(' ) + 1;
+               $falsy = array_fill( 0, $groupscount, false );
+
+               // Munge the regex by escaping slashes
+               $needle = preg_replace( '!(\\\\\\\\)*(\\\\)?/!', '$1\/', 
$needle );
+               $needle = "/$needle/u";
+
+               // Suppress and restore are here for the same reason as T177744
+               MediaWiki\suppressWarnings();
+               $check = preg_match( $needle, $haystack, $matches );
+               MediaWiki\restoreWarnings();
+
+               if ( $check === false ) {
+                       throw new AFPUserVisibleException(
+                               'regexfailure',
+                               $this->mCur->pos,
+                               [ 'unspecified error in preg_match()', $needle ]
+                       );
+               }
+
+               // Returned array has non-empty positions identical to the ones 
returned
+               // by the third parameter of a standard preg_match call 
($matches in this case).
+               // We want a union with falsy to return a fixed-dimension 
array.
+               return AFPData::newFromPHPVar( $matches + $falsy );
+       }
+
+       /**
         * @param array $args
         * @return AFPData
         * @throws AFPUserVisibleException
diff --git a/tests/phpunit/parserTest.php b/tests/phpunit/parserTest.php
index ab5ac91..d3b998e 100644
--- a/tests/phpunit/parserTest.php
+++ b/tests/phpunit/parserTest.php
@@ -124,6 +124,7 @@
        }
 
        /**
+        * Data provider for testCondCount method.
         * @return array
         */
        public function condCountCases() {
@@ -137,4 +138,79 @@
                        [ 'a == b & c == d', 1 ],
                ];
        }
+
+       /**
+        * get_matches should throw an exception with an invalid number of 
arguments.
+        * @expectedException AFPUserVisibleException
+        * @covers AbuseFilterParser::funcGetMatches
+        */
+       public function testGetMatchesInvalidArgs() {
+               $parser = self::getParser();
+               $parser->parse( "get_matches('')" );
+       }
+
+       /**
+        * get_matches should throw an exception when given an invalid regular 
expression.
+        * @expectedException AFPUserVisibleException
+        * @covers AbuseFilterParser::funcGetMatches
+        */
+       public function testGetMatchesInvalidRegex() {
+               $parser = self::getParser();
+               $parser->parse( "get_matches('this (should fail')" );
+       }
+
+       /**
+        * Ensure the get_matches function returns the expected output.
+        * @param string $needle Regex to pass to get_matches.
+        * @param string $haystack String to run regex against.
+        * @param string[] $expected The expected values of the matched groups.
+        * @covers AbuseFilterParser::funcGetMatches
+        * @dataProvider getMatchesCases
+        */
+       public function testGetMatches( $needle, $haystack, $expected ) {
+               $parser = self::getParser();
+               $afpData = $parser->intEval( "get_matches('$needle', 
'$haystack')" )->data;
+
+               // Extract matches from AFPData.
+               $matches = array_map( function ( $afpDatum ) {
+                       return $afpDatum->data;
+               }, $afpData );
+
+               $this->assertEquals( $expected, $matches );
+       }
+
+       /**
+        * Data provider for get_matches method.
+        * @return array
+        */
+       public function getMatchesCases() {
+               return [
+                       [
+                               'You say (.*) \(and I say (.*)\)\.',
+                               'You say hello (and I say goodbye).',
+                               [
+                                       'You say hello (and I say goodbye).',
+                                       'hello',
+                                       'goodbye',
+                               ],
+                       ],
+                       [
+                               'I(?: am)? the ((walrus|egg man).*)\!',
+                               'I am the egg man, I am the walrus !',
+                               [
+                                       'I am the egg man, I am the walrus !',
+                                       'egg man, I am the walrus ',
+                                       'egg man',
+                               ],
+                       ],
+                       [
+                               'this (does) not match',
+                               'foo bar',
+                               [
+                                       false,
+                                       false,
+                               ],
+                       ],
+               ];
+       }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/389778
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I19366ebcaa4d0f007dd675a61c91457dde57f604
Gerrit-PatchSet: 23
Gerrit-Project: mediawiki/extensions/AbuseFilter
Gerrit-Branch: master
Gerrit-Owner: Daimona Eaytoy <[email protected]>
Gerrit-Reviewer: Daimona Eaytoy <[email protected]>
Gerrit-Reviewer: Huji <[email protected]>
Gerrit-Reviewer: Jackmcbarn <[email protected]>
Gerrit-Reviewer: Matěj Suchánek <[email protected]>
Gerrit-Reviewer: Melos <[email protected]>
Gerrit-Reviewer: MusikAnimal <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to