Gergő Tisza has uploaded a new change for review. https://gerrit.wikimedia.org/r/247102
Change subject: [WIP] Add proper {{GRAMMAR}} support for Hungarian suffixes + articles ...................................................................... [WIP] Add proper {{GRAMMAR}} support for Hungarian suffixes + articles Change-Id: I1d21efec1b4160b9005e6788e0b03dafe2f0749a --- M languages/classes/LanguageHu.php M resources/src/mediawiki.language/languages/hu.js M tests/phpunit/languages/classes/LanguageHuTest.php A tests/qunit/suites/resources/mediawiki.language/languages/hu.js 4 files changed, 503 insertions(+), 22 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/02/247102/1 diff --git a/languages/classes/LanguageHu.php b/languages/classes/LanguageHu.php index bbf3b05..6b80faf 100644 --- a/languages/classes/LanguageHu.php +++ b/languages/classes/LanguageHu.php @@ -27,26 +27,208 @@ * @ingroup Language */ class LanguageHu extends Language { + protected static $vowelsBack = array( 'a', 'á', 'o', 'ó', 'u', 'ú' ); + protected static $vowelsFrontIllabial = array( 'e', 'é', 'i', 'í' ); + protected static $vowelsFrontLabial = array( 'ö', 'ő', 'ü', 'ű' ); + protected static $digraphs = array( 'cs', 'dz', 'gy', 'ly', 'ny', 'sz', 'zs' ); /** - * @param string $word - * @param string $case + * Callback for {{GRAMMAR:<type>|<param>|...}} + * See other functions for documentation of each type: + * - suffix: {@link addSuffix() addSuffix} + * - article: {@link getArticle() getArticle} + * - rol/ba/k: {@link addSuffixBC() addSuffixBC} + * + * @param string $type + * @param string $param1 + * @param string $param2 + * @param string $param3 + * @param string $param4 * @return string */ - function convertGrammar( $word, $case ) { - global $wgGrammarForms; - if ( isset( $wgGrammarForms[$this->getCode()][$case][$word] ) ) { - return $wgGrammarForms[$this->getCode()][$case][$word]; + public function convertGrammar( $type, $param1, $param2 = null, $param3 = null, $param4 = null ) { + switch ( $type ) { + case 'suffix': + return $this->addSuffix( 
$param1, $param2, $param3, $param4 ); + case 'article': + return $this->getArticle( $param1 ); + default: + return $this->addSuffixBC( $type, $param1 ); + } + } + + /** + * Combine word (presumably a noun) with suffix according to Hungarian grammar. + * Takes vowel harmony and assimilation into account - for details see: + * - https://en.wikipedia.org/wiki/Vowel_harmony#Hungarian + * - https://en.wikipedia.org/wiki/Hungarian_phonology#Vowel_harmony + * - https://en.wikipedia.org/wiki/Hungarian_noun_phrase#Case_endings + * + * This is far from perfect (and some of the rules are not algorithmizable at all) + * but this function should work with the suffixes following {{SITENAME}} in the + * interface messages, unless sitename is some tricky compound or foreign word. + * + * The function does three things: + * 1) select the suffix with matching vowel harmony (first parameter should be back, + * second front, third rounded (labial); second and third might be omitted if the + * suffix has less forms). + * 2) if the last letter of the word is 'a', 'e' or 'o', change it to 'á', 'é' or 'ó' + * respectively. + * 3) if the first letter of the suffix is 'v', change it according to assimilation + * rules. (This can get complicated if the last letter of the word is a + * digraph/trigraph or a double consonant.) + * + * @param string $word + * @param string $backSuffix The variant with back vowel (or the suffix, if it has + * no variants) + * @param string $frontSuffix The variant with front vowel (illabial front vowel if + * there are three forms). 
+ * @param string $labialSuffix The variant with labial front vowel + * @return string + */ + protected function addSuffix( $word, $backSuffix, $frontSuffix = null, $labialSuffix = null ) { + $word = trim($word); + $backSuffix = trim( $backSuffix ); + $frontSuffix = trim( $frontSuffix ); + $labialSuffix = trim( $labialSuffix ); + + $vowels = array_merge( static::$vowelsBack, static::$vowelsFrontIllabial, + static::$vowelsFrontLabial ); + + // calculate vowel harmony + get last vowel + if ( strtolower( mb_substr( $word, -4 ) ) === 'wiki' ) { + // there is no way to handle compound words in general, but + // special-case "somethingwiki" as it's a frequent sitename + $lastVowel = 'i'; + $vowelHarmony = 'front'; + } else { + $hasBackVowel = $hasFrontVowel = $lastVowel = false; + foreach ( preg_split( '//u', $word, -1, PREG_SPLIT_NO_EMPTY ) as $char ) { + if ( $char === ' ' || $char === '-' || $char === '–' ) { + // poor man's word split + $hasBackVowel = $hasFrontVowel = $lastVowel = false; + continue; + } elseif ( !in_array( $char, $vowels, true ) ) { + continue; + } + $lastVowel = $char; + if ( in_array( $char, self::$vowelsBack, true ) ) { + $hasBackVowel = true; + } else { + $hasFrontVowel = true; + } + } + + if ( !$lastVowel ) { + // Hungarian has no vowelless words; this is some kind of mistake + return ''; + } + + if ( $hasBackVowel && $hasFrontVowel ) { + $vowelHarmony = 'mixed'; + } elseif ( $hasBackVowel ) { + $vowelHarmony = 'back'; + } else { + $vowelHarmony = 'front'; + } } - switch ( $case ) { - case 'rol': - return $word . 'ról'; - case 'ba': - return $word . 'ba'; - case 'k': - return $word . 
'k'; + // select suffix that matches vowel harmony + if ( !$frontSuffix ) { + $suffix = $backSuffix; + } elseif ( $vowelHarmony === 'back' ) { + $suffix = $backSuffix; + } elseif ( $vowelHarmony === 'front' ) { + if ( $labialSuffix && in_array( $lastVowel, static::$vowelsFrontLabial, true ) ) { + $suffix = $labialSuffix; + } else { + $suffix = $frontSuffix; + } + } else { // $vowelHarmony === 'mixed' + if ( in_array( $lastVowel, static::$vowelsBack, true ) ) { + $suffix = $backSuffix; + } elseif ( in_array( $lastVowel, static::$vowelsFrontIllabial, true ) ) { + $suffix = $backSuffix; + } else { // $lastVowel in $vowelsFrontLabial + $suffix = $labialSuffix ?: $frontSuffix; + } } - return ''; + + // change word-ending vowel + $lastCharacter = mb_substr( $word, -1 ); + $wordEndVowelReplacements = array( 'a' => 'á', 'e' => 'é', 'o' => 'ó' ); + if ( array_key_exists( $lastCharacter, $wordEndVowelReplacements ) ) { + $word = mb_substr( $word, 0, -1 ) . $wordEndVowelReplacements[$lastCharacter]; + } + + $lastCharacter = mb_substr( $word, -1 ); + $lastTwoCharacters = mb_substr( $word, -2 ); + + // change start of suffix: v assimilates if the word ends with a consonant + if ( mb_substr( $suffix, 0, 1 ) === 'v' && !in_array( $lastCharacter, $vowels, true ) ) { + if ( $lastTwoCharacters === $lastCharacter . $lastCharacter ) { + // long consonant, does not get any longer + $suffix = mb_substr( $suffix, 1 ); + } elseif ( in_array( $lastTwoCharacters, static::$digraphs, true ) ) { + if ( mb_substr( $word, -2, 1 ) === mb_substr( $word, -3, 1 ) ) { + // long digraph, does not get longer + $suffix = mb_substr( $suffix, 1 ); + } else { + // single digraph, will become long now + $digraph = mb_substr( $word, -2, 1 ) . mb_substr( $word, -2 ); + $suffix = mb_substr( $suffix, 1 ); + $word = mb_substr( $word, 0, -2 ) . 
$digraph; + } + } else { + // single character, will become double now + // (no trigraph check needed since no word ends with the trigraph) + $suffix = mb_substr( $word, -1 ) . mb_substr( $suffix, 1 ); + } + // leave out first character of the suffix if its a vowel and the word also ends with a vowel + } elseif ( + in_array( $lastCharacter, $vowels, true ) + && in_array( mb_substr( $suffix, 0, 1), $vowels, true ) + ) { + $suffix = mb_substr( $suffix, 1 ); + } + + return $word . $suffix; + } + + /** + * B/C wrapper for the old suffix syntax. Unlike the old logic, this actually works. + */ + protected function addSuffixBC( $type, $word ) { + global $wgGrammarForms; + if ( isset( $wgGrammarForms[$this->getCode()][$type][$word] ) ) { + return $wgGrammarForms[$this->getCode()][$type][$word]; + } + + switch ( $type ) { + case 'rol': + return $this->addSuffix( $word, 'ról', 'ről' ); + case 'ba': + return $this->addSuffix( $word, 'ba', 'be' ); + case 'k': + return $this->addSuffix( $word, 'k' ); + } + return $word; + } + + /** + * Returns the definite article "a"/"az" in the form that's appropriate for this word. + * @param string $word + */ + protected function getArticle( $word ) { + $word = trim( $word ); + $vowels = array_merge( static::$vowelsBack, static::$vowelsFrontIllabial, + static::$vowelsFrontLabial ); + + if ( !strlen( $word ) ) { + return ''; + } + + return in_array( $word[0], $vowels, true ) ? 'az' : 'a'; } } + diff --git a/resources/src/mediawiki.language/languages/hu.js b/resources/src/mediawiki.language/languages/hu.js index 4f8f74d..75dcf21 100644 --- a/resources/src/mediawiki.language/languages/hu.js +++ b/resources/src/mediawiki.language/languages/hu.js @@ -1,23 +1,184 @@ /*! 
* Hungarian language functions - * @author Santhosh Thottingal + * @author Tisza Gergő */ -mediaWiki.language.convertGrammar = function ( word, form ) { +mediaWiki.language.vowelsBack = [ 'a', 'á', 'o', 'ó', 'u', 'ú' ]; +mediaWiki.language.vowelsFrontIllabial = [ 'e', 'é', 'i', 'í' ]; +mediaWiki.language.vowelsFrontLabial = [ 'ö', 'ő', 'ü', 'ű' ]; +mediaWiki.language.vowels = mediaWiki.language.vowelsBack + .concat( mediaWiki.language.vowelsFrontIllabial ) + .concat( mediaWiki.language.vowelsFrontLabial ); +mediaWiki.language.digraphs = [ 'cs', 'dz', 'gy', 'ly', 'ny', 'sz', 'zs' ]; + +/** + * Callback for {{GRAMMAR:<type>|<param>|...}} + * For detailed documentation see the PHP function: + * @see LanguageHu::convertGrammar() + * @param {String} type + * @param {String} param1 + * @param {String} [param2] + * @param {String} [param3] + * @param {String} [param4] + * @return {String} + */ +mediaWiki.language.convertGrammar = function ( + type, param1, param2, param3, param4 +) { + switch ( type ) { + case 'suffix': + return this.addSuffix( param1, param2, param3, param4 ); + case 'article': + return this.getArticle( param1 ); + default: + return this.addSuffixBC( param1, param2 ); + } +} + +/** + * PHP-like substr() that handles negative arguments (most browsers do but IE doesn't). + * Makes comparing the JS and PHP implementation a little less tedious. + * @private + * @param {String} word + * @param {number} start Position of first character (from end of string if negative) + * @param {number} [length] Number of characters + */ +mediaWiki.language.substr = function ( word, start, length ) { + if ( start < 0 ) { + start = word.length - start; + } + return word.substr( start, length ); +} + +/** + * Combine word (presumably a noun) with suffix according to Hungarian grammar. 
+ * For detailed documentation see the PHP function: + * @see LanguageHu::addSuffix() + * @param {String} word + * @param {String} backSuffix + * @param {String} [frontSuffix] + * @param {String} [labialSuffix] + */ +mediaWiki.language.addSuffix = function ( word, backSuffix, frontSuffix, labialSuffix ) { + var i, vowelHarmony, lastVowel, hasBackVowel, hasFrontVowel, suffix, + wordEndVowelReplacements, lastCharacter, lastTwoCharacters, digraph; + + // calculate vowel harmony + get last vowel + if ( this.substr( word, -4 ).toLowerCase() === 'wiki' ) { + lastVowel = 'i'; + vowelHarmony = 'front'; + } else { + for ( i = 0; i < word.length; i++ ) { + if ( word[i] === ' ' || word[i] === '-' || word[i] === '–' ) { + hasBackVowel = hasFrontVowel = lastVowel = undefined; + continue; + } else if ( !$.inArray( word[i], this.vowels ) ) { + continue; + } + lastVowel = word[i]; + if ( $.inArray( lastVowel, this.vowelsBack ) ) { + hasBackVowel = true; + } else { + hasFrontVowel = true; + } + } + + if ( !lastVowel ) { + return ''; + } + + if ( hasBackVowel && hasFrontVowel ) { + vowelHarmony = 'mixed'; + } else if ( hasBackVowel ) { + vowelHarmony = 'back'; + } else { + vowelHarmony = 'front'; + } + } + + // select suffix that matches vowel harmony + if ( !frontSuffix ) { + suffix = backSuffix; + } else if ( vowelHarmony === 'back' ) { + suffix = backSuffix; + } else if ( vowelHarmony === 'front' ) { + if ( labialSuffix && $.inArray( lastVowel, this.vowelsFrontLabial ) ) { + suffix = labialSuffix; + } else { + suffix = frontSuffix; + } + } else { // $vowelHarmony === 'mixed' + if ( $.inArray( lastVowel, this.vowelsBack ) ) { + suffix = backSuffix; + } else if ( $.inArray( lastVowel, this.vowelsFrontIllabial ) ) { + suffix = backSuffix; + } else { // lastVowel in vowelsFrontLabial + suffix = labialSuffix || frontSuffix; + } + } + + // change word-ending vowel + lastCharacter = this.substr( word, -1 ); + wordEndVowelReplacements = { 'a': 'á', 'e': 'é', 'o': 'ó' }; + if ( 
lastCharacter in wordEndVowelReplacements ) { + word = this.substr( word, 0, -1 ) + wordEndVowelReplacements[lastCharacter]; + } + + lastCharacter = this.substr( word, -1 ); + lastTwoCharacters = this.substr( word, -2 ); + + // change start of suffix: v assimilates if the word ends with a consonant + if ( this.substr( suffix, 0, 1 ) === 'v' && !$.inArray( lastCharacter, this.vowels ) ) { + if ( lastTwoCharacters === lastCharacter + lastCharacter ) { + suffix = this.substr( suffix, 1 ); + } else if ( $.inArray( lastTwoCharacters, this.digraphs ) ) { + if ( this.substr( word, -2, 1 ) === this.substr( word, -3, 1 ) ) { + suffix = this.substr( suffix, 1 ); + } else { + digraph = this.substr( word, -2, 1 ) + this.substr( word, -2 ); + suffix = this.substr( suffix, 1 ); + word = this.substr( word, 0, -2 ) + digraph; + } + } else { + suffix = this.substr( word, -1 ) . this.substr( suffix, 1 ); + } + // leave out first character of the suffix if its a vowel and the word also ends with a vowel + } else if ( + $.inArray( lastCharacter, this.vowels ) + && $.inArray( this.substr( suffix, 0, 1), this.vowels ) + ) { + suffix = this.substr( suffix, 1 ); + } + + return word + suffix; +} + +/** + * B/C wrapper for the old suffix syntax. + * @param {String} word + * @param {String} form + */ +mediaWiki.language.addSuffixBC = function ( word, form ) { var grammarForms = mediaWiki.language.getData( 'hu', 'grammarForms' ); if ( grammarForms && grammarForms[ form ] ) { return grammarForms[ form ][ word ]; } switch ( form ) { case 'rol': - word += 'ról'; - break; + return this.addSuffix( word, 'ról', 'ről' ); case 'ba': - word += 'ba'; - break; + return this.addSuffix( word, 'ba', 'be' ); case 'k': - word += 'k'; - break; + return this.addSuffix( word, 'k' ); } return word; }; + +/** + * Returns the definite article "a"/"az" in the form that's appropriate for this word. 
+ * @param {String} word + */ +mediaWiki.language.getArticle = function ( word ) { + return $.inArray( word, mediaWiki.language.vowels ) ? 'az' : 'a'; +} + diff --git a/tests/phpunit/languages/classes/LanguageHuTest.php b/tests/phpunit/languages/classes/LanguageHuTest.php index ee9197d..e9ae871 100644 --- a/tests/phpunit/languages/classes/LanguageHuTest.php +++ b/tests/phpunit/languages/classes/LanguageHuTest.php @@ -32,4 +32,72 @@ array( 'other', 200 ), ); } + + /** + * @dataProvider provideArticle + * @covers LanguageHu::getArticle + */ + public function testGetArticle( $expectedArticle, $word ) { + $lang = TestingAccessWrapper::newFromObject( $this->getLang() ); + $actualArticle = $lang->getArticle( $word ); + $this->assertEquals( $expectedArticle, $actualArticle ); + } + + public function provideArticle() { + return array( + array( 'a', 'ház' ), + array( 'az', 'ajtó' ), + ); + } + + /** + * @dataProvider provideSuffix + * @covers LanguageHu::addSuffix + */ + public function testAddSuffix( $expectedWord, $word, $backSuffix, $frontSuffix, $labialSuffix ) { + $lang = TestingAccessWrapper::newFromObject( $this->getLang() ); + $actualWord = $lang->addSuffix( $word, $backSuffix, $frontSuffix, $labialSuffix ); + $this->assertEquals( $expectedWord, $actualWord ); + } + + public function provideSuffix() { + return array( + array( 'fát', 'fa', 't', null, null ), + array( 'oldalnak', 'oldal', 'nak', 'nek', null ), + array( 'embernek', 'ember', 'nak', 'nek', null ), + array( 'sofőrnek', 'sofőr', 'nak', 'nek', null ), + array( 'oldalhoz', 'oldal', 'hoz', 'hez', 'höz' ), + array( 'emberhez', 'ember', 'hoz', 'hez', 'höz' ), + array( 'sofőrhöz', 'sofőr', 'hoz', 'hez', 'höz' ), + array( 'főnökhöz', 'főnök', 'hoz', 'hez', 'höz' ), + array( 'haverhoz', 'haver', 'hoz', 'hez', 'höz' ), + array( 'oldallal', 'oldal', 'val', 'vel', null ), + array( 'sakkal', 'sakk', 'val', 'vel', null ), + array( 'kéménnyel', 'kémény', 'val', 'vel', null ), + array( 'passzal', 'passz', 'val', 'vel', 
null ), + array( 'csévével', 'cséve', 'val', 'vel', null ), + array( 'tevén', 'teve', 'on', 'en', 'ön' ), + array( 'ValamilyenWikivel', 'ValamilyenWiki', 'val', 'vel', null ), + ); + } + + /** + * @dataProvider provideConvertGrammar + * @covers LanguageHu::convertGrammar + */ + public function testConvertGrammar( $expected, $type, $param1, $param2, $param3, $param4 ) { + $actual = $this->getLang()->convertGrammar( $type, $param1, $param2, $param3, $param4 ); + $this->assertEquals( $expected, $actual ); + } + + public function provideConvertGrammar() { + return array( + array( 'sofőrről', 'suffix', 'sofőr', 'ról', 'ről', null ), + array( 'a', 'article', 'sofőr', null, null, null ), + array( 'sofőrről', 'rol', 'sofőr', null, null, null ), + array( 'sofőrbe', 'ba', 'sofőr', null, null, null ), + array( 'kaszák', 'k', 'kasza', null, null, null ), + ); + } } + diff --git a/tests/qunit/suites/resources/mediawiki.language/languages/hu.js b/tests/qunit/suites/resources/mediawiki.language/languages/hu.js new file mode 100644 index 0000000..9e1a64f --- /dev/null +++ b/tests/qunit/suites/resources/mediawiki.language/languages/hu.js @@ -0,0 +1,70 @@ +( function ( mw, $ ) { + var data; + + QUnit.module( 'mediawiki.langage.hu' ); + + data = [ + [ 'fát', 'fa', 't', null, null ], + [ 'oldalnak', 'oldal', 'nak', 'nek', null ], + [ 'embernek', 'ember', 'nak', 'nek', null ], + [ 'sofőrnek', 'sofőr', 'nak', 'nek', null ], + [ 'oldalhoz', 'oldal', 'hoz', 'hez', 'höz' ], + [ 'emberhez', 'ember', 'hoz', 'hez', 'höz' ], + [ 'sofőrhöz', 'sofőr', 'hoz', 'hez', 'höz' ], + [ 'főnökhöz', 'főnök', 'hoz', 'hez', 'höz' ], + [ 'haverhoz', 'haver', 'hoz', 'hez', 'höz' ], + [ 'oldallal', 'oldal', 'val', 'vel', null ], + [ 'sakkal', 'sakk', 'val', 'vel', null ], + [ 'kéménnyel', 'kémény', 'val', 'vel', null ], + [ 'passzal', 'passz', 'val', 'vel', null ], + [ 'csévével', 'cséve', 'val', 'vel', null ], + [ 'tevén', 'teve', 'on', 'en', 'ön' ], + [ 'ValamilyenWikivel', 'ValamilyenWiki', 'val', 'vel', 
null ], + ]; + + QUnit.test( 'addSuffix', data.length, function ( assert ) { + $.each( data, function ( i, row ) { + var expected = row[0], + word = row[1], + backSuffix = row[2], + frontSuffix = row[3], + labialSuffix = row[4]; + assert.strictEqual( expected, mediawiki.language.addSuffix( word, backSuffix, frontSuffix, labialSuffix ) ); + } ); + } ); + + data = [ + [ 'a', 'ház' ], + [ 'az', 'ajtó' ], + ]; + + QUnit.test( 'article', data.length, function ( assert ) { + $.each( data, function ( i, row ) { + var expected = row[0], + word = row[1]; + assert.strictEqual( expected, mediawiki.language.getArticle( word ) ); + } ); + } ); + + data = [ + [ 'sofőrről', 'suffix', 'sofőr', 'ról', 'ről', null ], + [ 'a', 'article', 'sofőr', null, null, null ], + [ 'sofőrről', 'rol', 'sofőr', null, null, null ], + [ 'sofőrbe', 'ba', 'sofőr', null, null, null ], + [ 'kaszák', 'k', 'kasza', null, null, null ], + ]; + + QUnit.test( 'convertGrammar', data.length, function ( assert ) { + $.each( data, function ( i, row ) { + var expected = row[0], + type = row[1], + param1 = row[2], + param2 = row[3], + param3 = row[4], + param4 = row[5]; + assert.strictEqual( expected, mediawiki.language.convertGrammar( + type, param1, param2, param3, param4 ) ); + } ); + } ); +} ( mediaWiki, jQuery ) ); + -- To view, visit https://gerrit.wikimedia.org/r/247102 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I1d21efec1b4160b9005e6788e0b03dafe2f0749a Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: Gergő Tisza <gti...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits