jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/389778 )
Change subject: Add get_matches function
......................................................................
Add get_matches function
Added the get_matches function to store a regex match.
Bug: T179957
Change-Id: I19366ebcaa4d0f007dd675a61c91457dde57f604
---
M i18n/en.json
M i18n/qqq.json
M includes/AbuseFilter.class.php
M includes/parser/AbuseFilterParser.php
M tests/phpunit/parserTest.php
5 files changed, 135 insertions(+), 2 deletions(-)
Approvals:
jenkins-bot: Verified
MusikAnimal: Looks good to me, approved
diff --git a/i18n/en.json b/i18n/en.json
index 757e6f6..1a6ccf5 100644
--- a/i18n/en.json
+++ b/i18n/en.json
@@ -231,6 +231,7 @@
"abusefilter-edit-builder-funcs-norm": "Normalize (norm)",
"abusefilter-edit-builder-funcs-count": "Number of times string X
appears in string Y (count)",
"abusefilter-edit-builder-funcs-rcount": "Number of times regex X
appears in string Y (rcount)",
+ "abusefilter-edit-builder-funcs-get_matches": "Array of regex matches
within a text for each capturing group (get_matches)",
"abusefilter-edit-builder-funcs-rmwhitespace": "Remove whitespace
(rmwhitespace)",
"abusefilter-edit-builder-funcs-rmspecials": "Remove special characters
(rmspecials)",
"abusefilter-edit-builder-funcs-ip_in_range": "Is IP in range?
(ip_in_range)",
diff --git a/i18n/qqq.json b/i18n/qqq.json
index 38f17c2..522cc1b 100644
--- a/i18n/qqq.json
+++ b/i18n/qqq.json
@@ -262,6 +262,7 @@
"abusefilter-edit-builder-funcs-norm": "{{doc-important|Do not
translate \"'''norm'''\".}} Abuse filter syntax option in a dropdown from the
group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
"abusefilter-edit-builder-funcs-count": "{{doc-important|Do not
translate \"'''count'''\".}} Abuse filter syntax option in a dropdown from the
group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
"abusefilter-edit-builder-funcs-rcount": "{{doc-important|Do not
translate \"rcount\".}}\nAbuse filter syntax option in a dropdown from the
group {{msg-mw|abusefilter-edit-builder-group-funcs}}.\n\n\"regex\" stands for
\"regular expression\".",
+ "abusefilter-edit-builder-funcs-get_matches": "{{doc-important|Do not
translate \"get_matches\"}} Abuse filter syntax option in a dropdown from the
group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
"abusefilter-edit-builder-funcs-rmwhitespace": "{{doc-important|Do not
translate \"rmwhitespace\"}} Abuse filter syntax option in a dropdown from the
group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
"abusefilter-edit-builder-funcs-rmspecials": "{{doc-important|Do not
translate \"'''rmspecials'''\".}} Abuse filter syntax option in a dropdown from
the group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
"abusefilter-edit-builder-funcs-ip_in_range": "{{doc-important|Do not
translate \"'''ip_in_range'''\".}} Abuse filter syntax option in a dropdown
from the group {{msg-mw|abusefilter-edit-builder-group-funcs}}.",
diff --git a/includes/AbuseFilter.class.php b/includes/AbuseFilter.class.php
index 1e936b0..f58a46a 100644
--- a/includes/AbuseFilter.class.php
+++ b/includes/AbuseFilter.class.php
@@ -76,6 +76,7 @@
'norm(string)' => 'norm',
'count(needle,haystack)' => 'count',
'rcount(needle,haystack)' => 'rcount',
+ 'get_matches(needle,haystack)' => 'get_matches',
'rmwhitespace(text)' => 'rmwhitespace',
'rmspecials(text)' => 'rmspecials',
'ip_in_range(ip, range)' => 'ip_in_range',
diff --git a/includes/parser/AbuseFilterParser.php
b/includes/parser/AbuseFilterParser.php
index 1eb70f7..89f5a62 100644
--- a/includes/parser/AbuseFilterParser.php
+++ b/includes/parser/AbuseFilterParser.php
@@ -10,7 +10,7 @@
*/
public $mVars;
- //
length,lcase,ucase,ccnorm,rmdoubles,specialratio,rmspecials,norm,count
+ //
length,lcase,ucase,ccnorm,rmdoubles,specialratio,rmspecials,norm,count,get_matches
public static $mFunctions = [
'lcase' => 'funcLc',
'ucase' => 'funcUc',
@@ -28,6 +28,7 @@
'rmwhitespace' => 'funcRMWhitespace',
'count' => 'funcCount',
'rcount' => 'funcRCount',
+ 'get_matches' => 'funcGetMatches',
'ip_in_range' => 'funcIPInRange',
'contains_any' => 'funcContainsAny',
'substr' => 'funcSubstr',
@@ -193,7 +194,7 @@
* @param string $code
* @return AFPData
*/
- function intEval( $code ) {
+ public function intEval( $code ) {
// Setup, resetting
$this->mCode = $code;
$this->mTokens = AbuseFilterTokenizer::tokenize( $code );
@@ -1025,6 +1026,59 @@
}
/**
+ * Returns an array of matches of needle in the haystack, the first one
for the whole regex,
+ * the other ones for every capturing group.
+ *
+ * @param array $args
+ * @return AFPData A list of matches.
+ * @throws AFPUserVisibleException
+ */
+ protected function funcGetMatches( $args ) {
+ if ( count( $args ) < 2 ) {
+ throw new AFPUserVisibleException(
+ 'notenoughargs',
+ $this->mCur->pos,
+ [ 'get_matches', 2, count( $args ) ]
+ );
+ }
+ $needle = $args[0]->toString();
+ $haystack = $args[1]->toString();
+
+ // Count the number of capturing groups in the submitted
pattern.
+ // This way we can return a fixed-dimension array, much easier
to manage.
+ // First, strip away escaped parentheses
+ $sanitized = preg_replace( '/(\\\\\\\\)*\\\\\(/', '', $needle );
+ // Then strip starting parentheses of non-capturing groups
+ // (also atomics, lookahead and so on, even if not all of
them are supported)
+ $sanitized = preg_replace( '/\(\?/', '', $sanitized );
+ // Finally create an array of falses with dimension = # of
capturing groups
+ $groupscount = substr_count( $sanitized, '(' ) + 1;
+ $falsy = array_fill( 0, $groupscount, false );
+
+ // Munge the regex by escaping slashes
+ $needle = preg_replace( '!(\\\\\\\\)*(\\\\)?/!', '$1\/',
$needle );
+ $needle = "/$needle/u";
+
+ // Suppress and restore are here for the same reason as T177744
+ MediaWiki\suppressWarnings();
+ $check = preg_match( $needle, $haystack, $matches );
+ MediaWiki\restoreWarnings();
+
+ if ( $check === false ) {
+ throw new AFPUserVisibleException(
+ 'regexfailure',
+ $this->mCur->pos,
+ [ 'unspecified error in preg_match()', $needle ]
+ );
+ }
+
+ // Returned array has non-empty positions identical to the ones
returned
+ // by the third parameter of a standard preg_match call
($matches in this case).
+ // We want a union with falsy to return a fixed-dimension
array.
+ return AFPData::newFromPHPVar( $matches + $falsy );
+ }
+
+ /**
* @param array $args
* @return AFPData
* @throws AFPUserVisibleException
diff --git a/tests/phpunit/parserTest.php b/tests/phpunit/parserTest.php
index ab5ac91..d3b998e 100644
--- a/tests/phpunit/parserTest.php
+++ b/tests/phpunit/parserTest.php
@@ -124,6 +124,7 @@
}
/**
+ * Data provider for testCondCount method.
* @return array
*/
public function condCountCases() {
@@ -137,4 +138,79 @@
[ 'a == b & c == d', 1 ],
];
}
+
+ /**
+ * get_matches should throw an exception with an invalid number of
arguments.
+ * @expectedException AFPUserVisibleException
+ * @covers AbuseFilterParser::funcGetMatches
+ */
+ public function testGetMatchesInvalidArgs() {
+ $parser = self::getParser();
+ $parser->parse( "get_matches('')" );
+ }
+
+ /**
+ * get_matches should throw an exception when given an invalid regular
expression.
+ * @expectedException AFPUserVisibleException
+ * @covers AbuseFilterParser::funcGetMatches
+ */
+ public function testGetMatchesInvalidRegex() {
+ $parser = self::getParser();
+ $parser->parse( "get_matches('this (should fail')" );
+ }
+
+ /**
+ * Ensure the get_matches function returns the expected output.
+ * @param string $needle Regex to pass to get_matches.
+ * @param string $haystack String to run regex against.
+ * @param string[] $expected The expected values of the matched groups.
+ * @covers AbuseFilterParser::funcGetMatches
+ * @dataProvider getMatchesCases
+ */
+ public function testGetMatches( $needle, $haystack, $expected ) {
+ $parser = self::getParser();
+ $afpData = $parser->intEval( "get_matches('$needle',
'$haystack')" )->data;
+
+ // Extract matches from AFPData.
+ $matches = array_map( function ( $afpDatum ) {
+ return $afpDatum->data;
+ }, $afpData );
+
+ $this->assertEquals( $expected, $matches );
+ }
+
+ /**
+ * Data provider for get_matches method.
+ * @return array
+ */
+ public function getMatchesCases() {
+ return [
+ [
+ 'You say (.*) \(and I say (.*)\)\.',
+ 'You say hello (and I say goodbye).',
+ [
+ 'You say hello (and I say goodbye).',
+ 'hello',
+ 'goodbye',
+ ],
+ ],
+ [
+ 'I(?: am)? the ((walrus|egg man).*)\!',
+ 'I am the egg man, I am the walrus !',
+ [
+ 'I am the egg man, I am the walrus !',
+ 'egg man, I am the walrus ',
+ 'egg man',
+ ],
+ ],
+ [
+ 'this (does) not match',
+ 'foo bar',
+ [
+ false,
+ false,
+ ],
+ ],
+ ];
+ }
}
--
To view, visit https://gerrit.wikimedia.org/r/389778
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I19366ebcaa4d0f007dd675a61c91457dde57f604
Gerrit-PatchSet: 23
Gerrit-Project: mediawiki/extensions/AbuseFilter
Gerrit-Branch: master
Gerrit-Owner: Daimona Eaytoy <[email protected]>
Gerrit-Reviewer: Daimona Eaytoy <[email protected]>
Gerrit-Reviewer: Huji <[email protected]>
Gerrit-Reviewer: Jackmcbarn <[email protected]>
Gerrit-Reviewer: Matěj Suchánek <[email protected]>
Gerrit-Reviewer: Melos <[email protected]>
Gerrit-Reviewer: MusikAnimal <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits