jenkins-bot has submitted this change and it was merged. Change subject: (bug 61911) use localized number patterns. ......................................................................
(bug 61911) use localized number patterns. When splitting quantity strings, use regex patterns derived from MediaWiki language objects. IMPORTANT: needs https://github.com/DataValues/Number/pull/2 (that is, it needs the yet-to-be-released version 0.3 of data-values/number). Change-Id: If2345c45b5da77ca83437aa925e4b36631fafd15 --- M composer.json M lib/includes/formatters/MediaWikiNumberLocalizer.php M lib/includes/formatters/WikibaseValueFormatterBuilders.php M lib/includes/parsers/MediaWikiNumberUnlocalizer.php M lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php M repo/Wikibase.php 6 files changed, 215 insertions(+), 32 deletions(-) Approvals: Tobias Gritschacher: Looks good to me, approved WikidataJenkins: Verified jenkins-bot: Verified diff --git a/composer.json b/composer.json index b0c52c7..652ffee 100644 --- a/composer.json +++ b/composer.json @@ -27,7 +27,7 @@ "data-values/data-values": "~0.1.0", "data-values/common": "~0.2.0", "data-values/geo": "~0.1.0", - "data-values/number": "~0.2.0", + "data-values/number": "~0.3.0", "data-values/time": "~0.2.0", "data-values/validators": "~0.1.0", "data-values/data-types": "~0.1.0", diff --git a/lib/includes/formatters/MediaWikiNumberLocalizer.php b/lib/includes/formatters/MediaWikiNumberLocalizer.php index dbf162d..14a23b9 100644 --- a/lib/includes/formatters/MediaWikiNumberLocalizer.php +++ b/lib/includes/formatters/MediaWikiNumberLocalizer.php @@ -18,21 +18,29 @@ class MediaWikiNumberLocalizer implements Localizer { /** + * @var Language + */ + protected $language; + + /** + * @param Language $language + */ + public function __construct( Language $language ) { + $this->language = $language; + } + + /** * @see Localizer::localize() * * @since 0.5 * * @param string $number a numeric string - * @param string $language a language code - * @param FormatterOptions $options * * @return string * @throws InvalidArgumentException */ - public function localize( $number, $language, FormatterOptions $options ) { - 
$language = Language::factory( $language ); - - $localiezdNumber = $language->formatNum( $number ); + public function localizeNumber( $number ) { + $localiezdNumber = $this->language->formatNum( $number ); return $localiezdNumber; } } diff --git a/lib/includes/formatters/WikibaseValueFormatterBuilders.php b/lib/includes/formatters/WikibaseValueFormatterBuilders.php index 9308b15..e0dca5a 100644 --- a/lib/includes/formatters/WikibaseValueFormatterBuilders.php +++ b/lib/includes/formatters/WikibaseValueFormatterBuilders.php @@ -537,7 +537,8 @@ */ protected static function newQuantityFormatter( FormatterOptions $options, $builders ) { //TODO: use a builder for this DecimalFormatter - $localizer = new MediaWikiNumberLocalizer(); + $language = Language::factory( $options->getOption( ValueFormatter::OPT_LANG ) ); + $localizer = new MediaWikiNumberLocalizer( $language ); $decimalFormatter = new DecimalFormatter( $options, $localizer ); return new QuantityFormatter( $decimalFormatter, $options ); } diff --git a/lib/includes/parsers/MediaWikiNumberUnlocalizer.php b/lib/includes/parsers/MediaWikiNumberUnlocalizer.php index 5efcf63..5af1e20 100644 --- a/lib/includes/parsers/MediaWikiNumberUnlocalizer.php +++ b/lib/includes/parsers/MediaWikiNumberUnlocalizer.php @@ -2,8 +2,7 @@ namespace Wikibase\Lib; use Language; -use ValueParsers\ParserOptions; -use ValueParsers\Unlocalizer; +use ValueParsers\BasicUnlocalizer; /** * MediaWikiNumberUnlocalizer @@ -13,21 +12,81 @@ * @license GPL 2+ * @author Daniel Kinzler */ -class MediaWikiNumberUnlocalizer implements Unlocalizer { +class MediaWikiNumberUnlocalizer extends BasicUnlocalizer { + + protected static $unlocalizerMap = array( + "\xe2\x88\x92" => '-', // convert minus (U+2212) to hyphen + "\xe2\x93\x96" => '-', // convert "heavy minus" (U+2796) to hyphen + "\xe2\x93\x95" => '+', // convert "heavy plus" (U+2795) to plus + ); + + /** + * @var Language + */ + protected $language; + + /** + * @param Language $language + */ + public 
function __construct( Language $language ) { + $this->language = $language; + } /** * @see Unlocalizer::unlocalize() * * @param string $number string to process - * @param string $langCode language code - * @param ParserOptions $options * * @return string unlocalized string */ - public function unlocalize( $number, $langCode, ParserOptions $options ) { - $lang = Language::factory( $langCode ); + public function unlocalizeNumber( $number ) { + $canonicalizedNumber = $this->language->parseFormattedNumber( $number ); - $canonicalizedNumber = $lang->parseFormattedNumber( $number ); + // convert "pretty" characters not covered by parseFormattedNumber + $canonicalizedNumber = strtr( $canonicalizedNumber, self::$unlocalizerMap ); + + // strip any remaining whitespace + $canonicalizedNumber = preg_replace( '/\s/u', '', $canonicalizedNumber ); + return $canonicalizedNumber; } + + /** + * @see Unlocalizer::getNumberRegex() + * + * Constructs a regular expression based on Language::digitTransformTable() + * and Language::separatorTransformTable(). + * + * @param string $delim the regex delimiter, used for escaping. + * + * @return string regular expression + */ + public function getNumberRegex( $delim = '/' ) { + $digitMap = $this->language->digitTransformTable(); + $separatorMap = $this->language->separatorTransformTable(); + + if ( empty( $digitMap ) ) { + $numerals = '0123456789'; + } else { + $numerals = implode( '', array_keys( $digitMap ) ) // accept canonical numerals + . implode( '', array_values( $digitMap ) ); // ...and localized numerals + } + + if ( empty( $separatorMap ) ) { + $separators = '.,'; + } else { + $separators = implode( '', array_keys( $separatorMap ) ) // accept canonical separators + . implode( '', array_values( $separatorMap ) ); // ...and localized separators + } + + $characters = $numerals . $separators; + + // if any whitespace characters are acceptable, also accept a regular blank. 
+ if ( preg_match( '/\s/u', $characters ) ) { + $characters = $characters . ' '; + } + + return '[-+]?[' . preg_quote( $characters, $delim ) . ']+'; + } + } diff --git a/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php b/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php index a5c79d8..8cbce4e 100644 --- a/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php +++ b/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php @@ -2,10 +2,13 @@ namespace Wikibase\Lib\Test; -use ValueParsers\ParserOptions; +use Language; +use ValueParsers\ValueParser; +use Wikibase\Lib\MediaWikiNumberLocalizer; use Wikibase\Lib\MediaWikiNumberUnlocalizer; /** + * @covers Wikibase\Lib\MediaWikiNumberLocalizer * @covers Wikibase\Lib\MediaWikiNumberUnlocalizer * * @group ValueParsers @@ -19,24 +22,136 @@ public function provideUnlocalize() { return array( - array( '123,456.789', 'en', '123456.789' ), - array( '123.456,789', 'de', '123456.789' ), + array( '1', 'en', '1' ), + array( '-1.1', 'en', '-1.1' ), + + array( '-1.234,56', 'de', '-1234.56' ), + + array( "\xe2\x88\x921.234,56", 'de', '-1234.56' ), + array( "\xe2\x93\x961.234,56", 'de', '-1234.56' ), + array( "\xe2\x93\x951.234,56", 'de', '+1234.56' ), + + array( "1\xc2\xa0234,56", 'sv', '1234.56' ), + array( "1 234,56", 'sv', '1234.56' ), ); } /** * @dataProvider provideUnlocalize - * - * @param $localized - * @param $lang - * @param $expected */ - public function testUnlocalize( $localized, $lang, $expected ) { - $unlocalizer = new MediaWikiNumberUnlocalizer(); - $options = new ParserOptions(); + public function testUnlocalize( $localized, $languageCode, $canonical ) { + $language = Language::factory( $languageCode ); + $unlocalizer = new MediaWikiNumberUnlocalizer( $language ); - $actual = $unlocalizer->unlocalize( $localized, $lang, $options ); + $unlocalized = $unlocalizer->unlocalizeNumber( $localized ); - $this->assertEquals( $expected, $actual ); + $this->assertEquals( $canonical, $unlocalized ); + 
} + + public function provideLocalizationRoundTrip() { + $numbers = array( 12, -4.111, 12345678 ); + $languages = array( + 'en', 'es', 'pt', 'fr', 'de', 'sv', 'ru', // western arabic numerals, but different separators + 'ar', 'fa', 'my', 'pi', 'ne', 'kn', // different numerals + ); + + $cases = array(); + foreach ( $languages as $lang ) { + foreach ( $numbers as $num ) { + $cases[] = array( $num, $lang ); + } + }; + + return $cases; + } + + /** + * @dataProvider provideLocalizationRoundTrip + */ + public function testLocalizationRoundTrip( $number, $languageCode, $canonical = null ) { + if ( $canonical === null ) { + $canonical = "$number"; + } + + $language = Language::factory( $languageCode ); + + $localizer = new MediaWikiNumberLocalizer( $language ); + $unlocalizer = new MediaWikiNumberUnlocalizer( $language ); + + $localized = $localizer->localizeNumber( $number ); + $unlocalized = $unlocalizer->unlocalizeNumber( $localized ); + + $this->assertEquals( $canonical, $unlocalized ); + } + + public function provideGetNumberRegexMatch() { + return array( + array( '5' ), + array( '+3' ), + array( '-15' ), + + array( '5.3' ), + array( '+3.2' ), + array( '-15.77' ), + + array( '.3' ), + array( '+.2' ), + array( '-.77' ), + + array( '1,335.3' ), + array( '+1,333.2' ), + array( '-1,315.77' ), + + array( '12.345,77', 'de' ), + array( "12\xc2\xa0345,77", 'sv' ), // non-breaking space, as generated by the formatter + array( "12 345,77", 'sv' ), // regular space, as might be entered by users + ); + } + + /** + * @dataProvider provideGetNumberRegexMatch + */ + public function testGetNumberRegexMatch( $value, $lang = 'en' ) { + $lang = Language::factory( $lang ); + $unlocalizer = new MediaWikiNumberUnlocalizer( $lang ); + $regex = $unlocalizer->getNumberRegex(); + + $hex = utf8ToHexSequence( $regex ); + $this->assertTrue( (bool)preg_match( "/^($regex)$/u", $value ), "Hex: $hex" ); + } + + public function provideGetNumberRegexMismatch() { + return array( + array( '' ), + array( 
' ' ), + array( '+' ), + array( 'e' ), + + array( '.-' ), + + array( '0x20' ), + array( '2x2' ), + array( 'x2' ), + array( '2x' ), + + array( 'e.' ), + array( '.e' ), + array( '12e' ), + array( 'E17' ), + + array( '+-3' ), + array( '++7' ), + array( '--5' ), + ); + } + + /** + * @dataProvider provideGetNumberRegexMismatch + */ + public function testGetNumberRegexMismatch( $value, $lang = 'en' ) { + $unlocalizer = new MediaWikiNumberUnlocalizer( Language::factory( $lang ) ); + $regex = $unlocalizer->getNumberRegex(); + + $this->assertFalse( (bool)preg_match( "/^($regex)$/u", $value ) ); } } diff --git a/repo/Wikibase.php b/repo/Wikibase.php index c65b39a..86ad1d3 100644 --- a/repo/Wikibase.php +++ b/repo/Wikibase.php @@ -1,4 +1,5 @@ <?php +use ValueParsers\ValueParser; /** * Entry point for the Wikibase Repository extension. @@ -77,10 +78,9 @@ }; $wgValueParsers['quantity'] = function( ValueParsers\ParserOptions $options ) { - $unlocalizer = new Wikibase\Lib\MediaWikiNumberUnlocalizer(); - return new \ValueParsers\QuantityParser( - new \ValueParsers\DecimalParser( $options, $unlocalizer ), - $options ); + $language = Language::factory( $options->getOption( ValueParser::OPT_LANG ) ); + $unlocalizer = new Wikibase\Lib\MediaWikiNumberUnlocalizer( $language); + return new \ValueParsers\QuantityParser( $options, $unlocalizer ); }; $wgValueParsers['bool'] = 'ValueParsers\BoolParser'; -- To view, visit https://gerrit.wikimedia.org/r/116986 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: If2345c45b5da77ca83437aa925e4b36631fafd15 Gerrit-PatchSet: 6 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Daniel Kinzler <daniel.kinz...@wikimedia.de> Gerrit-Reviewer: Aude <aude.w...@gmail.com> Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de> Gerrit-Reviewer: Thiemo Mättig (WMDE) <thiemo.maet...@wikimedia.de> Gerrit-Reviewer: Tobias Gritschacher <tobias.gritschac...@wikimedia.de> 
Gerrit-Reviewer: WikidataJenkins <wikidata-servi...@wikimedia.de> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits