Thiemo Mättig (WMDE) has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/120777

Change subject: Make NumberUnlocalizer aware of incomplete 
$separatorTransformTable
......................................................................

Make NumberUnlocalizer aware of incomplete $separatorTransformTable

Very few (currently five) languages replace only one of the two
separator characters, e.g. by setting
$separatorTransformTable = array( ',' => ' ' );
instead of
$separatorTransformTable = array( ',' => ' ', '.' => '.' );
That's perfectly fine.

Code doing the inverse operation (unlocalizing) shouldn't rely on
both characters being present in the array keys. The canonical
characters should simply be hard-coded as ',' and '.'. There are no
constants for these as far as I can tell.

Same for $digitTransformTable if a language does not replace all
ten digits, e.g. $digitTransformTable = array( '0' => '?' );
We currently don't have a language that does that.

This problem does not currently occur on wikidata.org, and probably
will not, since the parse API calls omit the "lang" option.

Example request that does pass the "lang" option:
http://localhost/repowiki/api.php?action=wbparsevalue&format=json&parser=quantity&values=1.2&options={"lang":"la"}

Change-Id: I754fe953895212d190f40711cec4dd20f71aa19d
---
M lib/includes/parsers/MediaWikiNumberUnlocalizer.php
M lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php
2 files changed, 29 insertions(+), 17 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase 
refs/changes/77/120777/1

diff --git a/lib/includes/parsers/MediaWikiNumberUnlocalizer.php 
b/lib/includes/parsers/MediaWikiNumberUnlocalizer.php
index 5af1e20..e669e16 100644
--- a/lib/includes/parsers/MediaWikiNumberUnlocalizer.php
+++ b/lib/includes/parsers/MediaWikiNumberUnlocalizer.php
@@ -57,36 +57,31 @@
         * Constructs a regular expression based on 
Language::digitTransformTable()
         * and Language::separatorTransformTable().
         *
-        * @param string $delim the regex delimiter, used for escaping.
+        * @param string $delimiter The regex delimiter, used for escaping.
         *
         * @return string regular expression
         */
-       public function getNumberRegex( $delim = '/' ) {
+       public function getNumberRegex( $delimiter = '/' ) {
                $digitMap = $this->language->digitTransformTable();
                $separatorMap = $this->language->separatorTransformTable();
 
-               if ( empty( $digitMap ) ) {
-                       $numerals = '0123456789';
-               } else {
-                       $numerals = implode( '', array_keys( $digitMap ) ) // 
accept canonical numerals
-                               . implode( '', array_values( $digitMap ) ); // 
...and localized numerals
-               }
+               // Always accept canonical digits and separators
+               $characters = '0123456789,.';
 
-               if ( empty( $separatorMap ) ) {
-                       $separators = '.,';
-               } else {
-                       $separators = implode( '', array_keys( $separatorMap ) 
) // accept canonical separators
-                               . implode( '', array_values( $separatorMap ) ); 
// ...and localized separators
+               // Add localized digits and separators
+               if ( is_array( $digitMap ) ) {
+                       $characters .= implode( '', array_values( $digitMap ) );
                }
-
-               $characters = $numerals . $separators;
+               if ( is_array( $separatorMap ) ) {
+                       $characters .= implode( '', array_values( $separatorMap 
) );
+               }
 
                // if any whitespace characters are acceptable, also accept a 
regular blank.
                if ( preg_match( '/\s/u', $characters ) ) {
-                       $characters = $characters . ' ';
+                       $characters .= ' ';
                }
 
-               return '[-+]?[' . preg_quote( $characters, $delim ) . ']+';
+               return '[-+]?[' . preg_quote( $characters, $delimiter ) . ']+';
        }
 
 }
diff --git a/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php 
b/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php
index 68c7313..8c5e8fd 100644
--- a/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php
+++ b/lib/tests/phpunit/parsers/MediaWikiNumberUnlocalizerTest.php
@@ -19,6 +19,10 @@
  */
 class MediaWikiNumberUnlocalizerTest extends \PHPUnit_Framework_TestCase {
 
+       /**
+        * @return array[] Array of arrays of three strings: localized value, 
language code and expected
+        * canonical value
+        */
        public function provideUnlocalize() {
                return array(
                        array( '1', 'en', '1' ),
@@ -47,6 +51,10 @@
                $this->assertEquals( $canonical, $unlocalized );
        }
 
+       /**
+        * @return array[] Array of arrays of two or three values: number, 
language code and optional
+        * expected canonical value
+        */
        public function provideLocalizationRoundTrip() {
                $numbers = array( 12, -4.111, 12345678 );
                $languages = array(
@@ -83,6 +91,9 @@
                $this->assertEquals( $canonical, $unlocalized );
        }
 
+       /**
+        * @return array[] Array of arrays of one or two strings: value and 
optional language code
+        */
        public function provideGetNumberRegexMatch() {
                return array(
                        array( '5' ),
@@ -104,6 +115,8 @@
                        array( '12.345,77', 'de' ),
                        array( "12\xc2\xa0345,77", 'sv' ), // non-breaking 
space, as generated by the formatter
                        array( "12 345,77", 'sv' ), // regular space, as might 
be entered by users
+
+                       array( "1\xc2\xa0234.56", 'la' ), // incomplete 
separatorTransformTable
                );
        }
 
@@ -119,6 +132,9 @@
                $this->assertTrue( (bool)preg_match( "/^($regex)$/u", $value ), 
"Hex: $hex" );
        }
 
+       /**
+        * @return array[] Array of arrays of one or two strings: value and 
optional language code
+        */
        public function provideGetNumberRegexMismatch() {
                return array(
                        array( '' ),
@@ -153,4 +169,5 @@
 
                $this->assertFalse( (bool)preg_match( "/^($regex)$/u", $value ) 
);
        }
+
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/120777
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I754fe953895212d190f40711cec4dd20f71aa19d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Thiemo Mättig (WMDE) <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to