https://www.mediawiki.org/wiki/Special:Code/MediaWiki/103327
Revision: 103327
Author: hashar
Date: 2011-11-16 15:12:00 +0000 (Wed, 16 Nov 2011)
Log Message:
-----------
bug 28643 improvement to serbian variants conversion
This patch is a PARTIAL merge of /branches/nikola/phase3 ::
r85224 avoid double conversion when text already use the correct variant
r85239 minor fixes to previous
r85308 documentation (@since 1.18 update to 1.19)
r101359 guessVariant doc + boolean typecast
r101369 tests
r103131 additional test
Test plan:
==========
$ ./phpunit.php --filter LanguageSr
PHPUnit 3.6.3 by Sebastian Bergmann.
Configuration read from /srv/trunk/tests/phpunit/suite.xml
.....
Time: 1 second, Memory: 78.50Mb
OK (5 tests, 19 assertions)
$
Modified Paths:
--------------
trunk/phase3/RELEASE-NOTES-1.19
trunk/phase3/languages/LanguageConverter.php
trunk/phase3/languages/classes/LanguageSr.php
Added Paths:
-----------
trunk/phase3/tests/phpunit/languages/LanguageSrTest.php
Property Changed:
----------------
trunk/phase3/languages/LanguageConverter.php
Modified: trunk/phase3/RELEASE-NOTES-1.19
===================================================================
--- trunk/phase3/RELEASE-NOTES-1.19 2011-11-16 14:33:57 UTC (rev 103326)
+++ trunk/phase3/RELEASE-NOTES-1.19 2011-11-16 15:12:00 UTC (rev 103327)
@@ -166,6 +166,7 @@
* (bug 30217) Make pt-br a fallback of pt.
* (bug 31193) Set fallback language of Assamese from Bengali to English.
* Update date format for dsb and hsb: month names need the genitive.
+* (bug 28643) Serbian variant conversion improvements (Nikola Smolenski)
=== Other changes in 1.19 ===
* jquery.mwPrototypes module was renamed to jquery.mwExtension.
Modified: trunk/phase3/languages/LanguageConverter.php
===================================================================
--- trunk/phase3/languages/LanguageConverter.php 2011-11-16 14:33:57 UTC
(rev 103326)
+++ trunk/phase3/languages/LanguageConverter.php 2011-11-16 15:12:00 UTC
(rev 103327)
@@ -322,6 +322,10 @@
}
}
+ if( $this->guessVariant( $text, $toVariant ) ) {
+ return $text;
+ }
+
/* we convert everything except:
1. HTML markups (anything between < and >)
2. HTML entities
@@ -571,7 +575,7 @@
*/
public function convertTo( $text, $variant ) {
global $wgDisableLangConversion;
- if ( $wgDisableLangConversion ) {
+ if ( $wgDisableLangConversion || $this->guessVariant( $text, $variant ) ) {
return $text;
}
return $this->recursiveConvertTopLevel( $text, $variant );
@@ -773,6 +777,20 @@
}
/**
+ * Guess if a text is written in a variant. This should be implemented in subclasses.
+ *
+ * @param string $text the text to be checked
+ * @param string $variant language code of the variant to be checked for
+ * @return bool true if $text appears to be written in $variant, false if not
+ *
+ * @author Nikola Smolenski <[email protected]>
+ * @since 1.19
+ */
+ public function guessVariant($text, $variant) {
+ return false;
+ }
+
+ /**
* Load default conversion tables.
* This method must be implemented in derived class.
*
Property changes on: trunk/phase3/languages/LanguageConverter.php
___________________________________________________________________
Added: svn:mergeinfo
+ /branches/REL1_15/phase3/languages/LanguageConverter.php:51646
/branches/REL1_17/phase3/languages/LanguageConverter.php:81445,81448
/branches/new-installer/phase3/languages/LanguageConverter.php:43664-66004
/branches/nikola/phase3/languages/LanguageConverter.php:85106-103326
/branches/sqlite/languages/LanguageConverter.php:58211-58321
Modified: trunk/phase3/languages/classes/LanguageSr.php
===================================================================
--- trunk/phase3/languages/classes/LanguageSr.php 2011-11-16 14:33:57 UTC
(rev 103326)
+++ trunk/phase3/languages/classes/LanguageSr.php 2011-11-16 15:12:00 UTC
(rev 103327)
@@ -173,6 +173,32 @@
return $ret;
}
+
+ /**
+ * Guess if a text is written in Cyrillic or Latin.
+ * Overrides LanguageConverter::guessVariant()
+ *
+ * @param string $text The text to be checked
+ * @param string $variant Language code of the variant to be checked for
+ * @return bool true if $text appears to be written in $variant
+ *
+ * @author Nikola Smolenski <[email protected]>
+ * @since 1.19
+ */
+ public function guessVariant( $text, $variant ) {
+ $numCyrillic = preg_match_all("/[шђчћжШЂЧЋЖ]/u", $text, $dummy);
+ $numLatin = preg_match_all("/[šđčćžŠĐČĆŽ]/u", $text, $dummy);
+
+ if( $variant == 'sr-ec' ) {
+ return (boolean) ($numCyrillic > $numLatin);
+ } else if( $variant == 'sr-el' ) {
+ return (boolean) ($numLatin > $numCyrillic);
+ } else {
+ return false;
+ }
+
+ }
+
}
/**
Copied: trunk/phase3/tests/phpunit/languages/LanguageSrTest.php (from rev
103131, branches/nikola/phase3/tests/phpunit/languages/LanguageSrTest.php)
===================================================================
--- trunk/phase3/tests/phpunit/languages/LanguageSrTest.php
(rev 0)
+++ trunk/phase3/tests/phpunit/languages/LanguageSrTest.php 2011-11-16
15:12:00 UTC (rev 103327)
@@ -0,0 +1,165 @@
+<?php
+/**
+ * PHPUnit tests for the Serbian language.
+ * The language can be represented using two scripts:
+ * - Latin (sr-el)
+ * - Cyrillic (sr-ec)
+ * Both representations seem to be bijective, hence MediaWiki can convert
+ * from one script to the other.
+ *
+ * @author Antoine Musso <hashar at free dot fr>
+ * @copyright Copyright © 2011, Antoine Musso <hashar at free dot fr>
+ * @file
+ */
+
+require_once dirname(dirname(__FILE__)). '/bootstrap.php';
+
+/** Tests for MediaWiki languages/classes/LanguageSr.php */
+class LanguageSrTest extends MediaWikiTestCase {
+ /* Language object. Initialized before each test */
+ private $lang;
+
+ function setUp() {
+ $this->lang = Language::factory( 'Sr' );
+ }
+ function tearDown() {
+ unset( $this->lang );
+ }
+
+ ##### TESTS #######################################################
+
+ function testEasyConversions( ) {
+ $this->assertCyrillic(
+ 'шђчћжШЂЧЋЖ',
+ 'Cyrillic guessing characters'
+ );
+ $this->assertLatin(
+ 'šđčćžŠĐČĆŽ',
+ 'Latin guessing characters'
+ );
+ }
+
+ function testMixedConversions() {
+ $this->assertCyrillic(
+ 'шђчћжШЂЧЋЖ - šđčćž',
+ 'Mostly cyrillic characters'
+ );
+ $this->assertLatin(
+ 'šđčćžŠĐČĆŽ - шђчћж',
+ 'Mostly latin characters'
+ );
+ }
+
+ function testSameAmountOfLatinAndCyrillicGetConverted() {
+ $this->assertConverted(
+ '4 latin: šđčć | 4 cyrillic: шђчћ',
+ 'sr-ec'
+ );
+ $this->assertConverted(
+ '4 latin: šđčć | 4 cyrillic: шђчћ',
+ 'sr-el'
+ );
+ }
+
+ /**
+ * @author Nikola Smolenski
+ */
+ function testConversionToCyrillic() {
+ $this->assertEquals( 'абвг',
+ $this->convertToCyrillic( 'abvg' )
+ );
+ $this->assertEquals( 'абвг',
+ $this->convertToCyrillic( 'абвг' )
+ );
+ $this->assertEquals( 'abvgшђжчћ',
+ $this->convertToCyrillic( 'abvgшђжчћ' )
+ );
+ $this->assertEquals( 'абвгшђжчћ',
+ $this->convertToCyrillic( 'абвгšđžčć' )
+ );
+ //Roman numerals are not converted
+ $this->assertEquals( 'а I б II в III г IV шђжчћ',
+ $this->convertToCyrillic( 'a I b II v III g IV šđžčć' )
+ );
+ }
+
+ function testConversionToLatin() {
+ $this->assertEquals( 'abcd',
+ $this->convertToLatin( 'abcd' )
+ );
+ $this->assertEquals( 'abcd',
+ $this->convertToLatin( 'абцд' )
+ );
+ $this->assertEquals( 'abcdšđžčć',
+ $this->convertToLatin( 'abcdшђжчћ' )
+ );
+ $this->assertEquals( 'абцдšđžčć',
+ $this->convertToLatin( 'абцдšđžčć' )
+ );
+
+ }
+
+ ##### HELPERS #####################################################
+ /**
+ * Wrapper to verify text stays the same after applying conversion
+ * @param $text string Text to convert
+ * @param $variant string Language variant 'sr-ec' or 'sr-el'
+ * @param $msg string Optional message
+ */
+ function assertUnConverted( $text, $variant, $msg = '' ) {
+ $this->assertEquals(
+ $text,
+ $this->convertTo( $text, $variant ),
+ $msg
+ );
+ }
+ /**
+ * Wrapper to verify a text is different once converted to a variant.
+ * @param $text string Text to convert
+ * @param $variant string Language variant 'sr-ec' or 'sr-el'
+ * @param $msg string Optional message
+ */
+ function assertConverted( $text, $variant, $msg = '' ) {
+ $this->assertNotEquals(
+ $text,
+ $this->convertTo( $text, $variant ),
+ $msg
+ );
+ }
+
+ /**
+ * Verify the given Cyrillic text is not converted when using
+ * the Cyrillic variant and is converted to Latin when using
+ * the Latin variant.
+ */
+ function assertCyrillic( $text, $msg = '' ) {
+ $this->assertUnConverted( $text, 'sr-ec', $msg );
+ $this->assertConverted( $text, 'sr-el', $msg );
+ }
+ /**
+ * Verify the given Latin text is not converted when using
+ * the Latin variant and is converted to Cyrillic when using
+ * the Cyrillic variant.
+ */
+ function assertLatin( $text, $msg = '' ) {
+ $this->assertUnConverted( $text, 'sr-el', $msg );
+ $this->assertConverted( $text, 'sr-ec', $msg );
+ }
+
+
+ /** Wrapper for converter::convertTo() method*/
+ function convertTo( $text, $variant ) {
+ return $this
+ ->lang
+ ->mConverter
+ ->convertTo(
+ $text, $variant
+ );
+ }
+ function convertToCyrillic( $text ) {
+ return $this->convertTo( $text, 'sr-ec' );
+ }
+ function convertToLatin( $text ) {
+ return $this->convertTo( $text, 'sr-el' );
+ }
+}
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs