Thiemo Mättig (WMDE) has uploaded a new change for review. https://gerrit.wikimedia.org/r/120777
Change subject: Make NumberUnlocalizer aware of incomplete $separatorTransformTable ...................................................................... Make NumberUnlocalizer aware of incomplete $separatorTransformTable Very few (currently five) languages replace only one of the two separator characters, e.g. by setting $separatorTransformTable = array( ',' => ' ' ); instead of $separatorTransformTable = array( ',' => ' ', '.' => '.' ); That's perfectly fine. Code doing the inverse operation (unlocalizing) shouldn't rely on the presence of both characters in the array keys. The canonical characters should simply be hard coded ',' and '.'. There are no constants for these as far as I can tell. Same for $digitTransformTable if a language does not replace all ten digits, e.g. $digitTransformTable = array( '0' => '?' ); We currently don't have a language that does that. This problem does not occur at wikidata.org currently and probably will not anyway since the parse API calls miss the "lang" option. http://localhost/repowiki/api.php?action=wbparsevalue&format=json&parser=quantity&values=1.2&options={"lang":"la"} Change-Id: I754fe953895212d190f40711cec4dd20f71aa19d --- M lib/includes/parsers/MediaWikiNumberUnlocalizer.php M lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php 2 files changed, 29 insertions(+), 17 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/77/120777/1 diff --git a/lib/includes/parsers/MediaWikiNumberUnlocalizer.php b/lib/includes/parsers/MediaWikiNumberUnlocalizer.php index 5af1e20..e669e16 100644 --- a/lib/includes/parsers/MediaWikiNumberUnlocalizer.php +++ b/lib/includes/parsers/MediaWikiNumberUnlocalizer.php @@ -57,36 +57,31 @@ * Constructs a regular expression based on Language::digitTransformTable() * and Language::separatorTransformTable(). * - * @param string $delim the regex delimiter, used for escaping. + * @param string $delimiter The regex delimiter, used for escaping. * * @return string regular expression */ - public function getNumberRegex( $delim = '/' ) { + public function getNumberRegex( $delimiter = '/' ) { $digitMap = $this->language->digitTransformTable(); $separatorMap = $this->language->separatorTransformTable(); - if ( empty( $digitMap ) ) { - $numerals = '0123456789'; - } else { - $numerals = implode( '', array_keys( $digitMap ) ) // accept canonical numerals - . implode( '', array_values( $digitMap ) ); // ...and localized numerals - } + // Always accept canonical digits and separators + $characters = '0123456789,.'; - if ( empty( $separatorMap ) ) { - $separators = '.,'; - } else { - $separators = implode( '', array_keys( $separatorMap ) ) // accept canonical separators - . implode( '', array_values( $separatorMap ) ); // ...and localized separators + // Add localized digits and separators + if ( is_array( $digitMap ) ) { + $characters .= implode( '', array_values( $digitMap ) ); } - - $characters = $numerals . $separators; + if ( is_array( $separatorMap ) ) { + $characters .= implode( '', array_values( $separatorMap ) ); + } // if any whitespace characters are acceptable, also accept a regular blank. if ( preg_match( '/\s/u', $characters ) ) { - $characters = $characters . ' '; + $characters .= ' '; } - return '[-+]?[' . preg_quote( $characters, $delim ) . ']+'; + return '[-+]?[' . preg_quote( $characters, $delimiter ) . ']+'; } } diff --git a/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php b/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php index 68c7313..8c5e8fd 100644 --- a/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php +++ b/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php @@ -19,6 +19,10 @@ */ class MediaWikiNumberUnlocalizerTest extends \PHPUnit_Framework_TestCase { + /** + * @return array[] Array of arrays of three strings: localized value, language code and expected + * canonical value + */ public function provideUnlocalize() { return array( array( '1', 'en', '1' ), @@ -47,6 +51,10 @@ $this->assertEquals( $canonical, $unlocalized ); } + /** + * @return array[] Array of arrays of two or three values: number, language code and optional + * expected canonical value + */ public function provideLocalizationRoundTrip() { $numbers = array( 12, -4.111, 12345678 ); $languages = array( @@ -83,6 +91,9 @@ $this->assertEquals( $canonical, $unlocalized ); } + /** + * @return array[] Array of arrays of one or two strings: value and optional language code + */ public function provideGetNumberRegexMatch() { return array( array( '5' ), @@ -104,6 +115,8 @@ array( '12.345,77', 'de' ), array( "12\xc2\xa0345,77", 'sv' ), // non-breaking space, as generated by the formatter array( "12 345,77", 'sv' ), // regular space, as might be entered by users + + array( "1\xc2\xa0234.56", 'la' ), // incomplete separatorTransformTable ); } @@ -119,6 +132,9 @@ $this->assertTrue( (bool)preg_match( "/^($regex)$/u", $value ), "Hex: $hex" ); } + /** + * @return array[] Array of arrays of one or two strings: value and optional language code + */ public function provideGetNumberRegexMismatch() { return array( array( '' ), @@ -153,4 +169,5 @@ $this->assertFalse( (bool)preg_match( "/^($regex)$/u", $value ) ); } + } -- To view, visit https://gerrit.wikimedia.org/r/120777 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I754fe953895212d190f40711cec4dd20f71aa19d Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Thiemo Mättig (WMDE) <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
