Reedy has uploaded a new change for review. https://gerrit.wikimedia.org/r/52924
Change subject: (bug 43799) create language-specific collations for category sorting ...................................................................... (bug 43799) create language-specific collations for category sorting This allows one to *finally* get articles to be correctly sorted on category pages for 67 languages based on the Latin, Greek and Cyrillic alphabets. Fixes bug 29788, bug 41040, and bug 42412 (implementing collations for Swedish, Polish, Ukrainian). Full list of language codes this adds support for: af, ast, az, be, bg, br, bs, ca, co, cs, cy, da, de, dsb, el, en, eo, es, et, eu, fi, fo, fr, fur, fy, ga, gd, gl, hr, hsb, hu, is, it, kk, kl, ku, ky, la, lb, lt, lv, mk, mo, mt, nl, no, oc, pl, pt, rm, ro, ru, rup, sco, sk, sl, smn, sq, sr, sv, tk, tl, tr, tt, uk, uz, vi. * Include data about first-letter characters for 67 language tailorings. This data was generated based on http://developer.mimer.com/charts/tailorings.htm by a Ruby script (https://www.mediawiki.org/wiki/User:Matma_Rex/generateCollationTailoringData.rb), then adjusted by hand (removed duplicate definitions for Spanish and German, changed code fil -> tl (Filipino -> Tagalog)). * Mark languages verified by native speakers (currently only pl (Polish), which I verified myself, and fi (Finnish), checked by Niklas). * Allow for collations named like 'uca-<langcode>', mapping them to IcuCollation with the appropriate parameter. The code doesn't check if we actually have data for the given language, as that is checked after the IcuCollation class instance is constructed. * Add the tailoring data to the default first-letter file (for the root collation) before it's cached for the given locale. 
Change-Id: I838484b9aaf23945fe7880fef2e3da5f5c06877f --- M RELEASE-NOTES-1.21 M includes/Collation.php 2 files changed, 107 insertions(+), 5 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/24/52924/1 diff --git a/RELEASE-NOTES-1.21 b/RELEASE-NOTES-1.21 index 7e06218..3f7765e 100644 --- a/RELEASE-NOTES-1.21 +++ b/RELEASE-NOTES-1.21 @@ -99,6 +99,18 @@ * WikiText now permits the use of WAI-ARIA's role="presentation" inside of html elements and tables. This allows presentational markup, especially tables. To be marked up as such. +* maintenance/sql.php learned the --cluster option. Let you run the script + on some external cluster instead of the primary cluster for a given wiki. +* (bug 20281) test the parsing of inline URLs. +* Added Special:PagesWithProp, which lists pages using a particular page property. +* Implemented language-specific collations for category sorting for 67 languages + based in latin, greek and cyrillic alphabets. This allows one to *finally* get + articles to be correctly sorted on category pages. They are named + 'uca-<langcode>', where <langcode> is one of: af, ast, az, be, bg, br, bs, ca, + co, cs, cy, da, de, dsb, el, en, eo, es, et, eu, fi, fo, fr, fur, fy, ga, gd, + gl, hr, hsb, hu, is, it, kk, kl, ku, ky, la, lb, lt, lv, mk, mo, mt, nl, no, + oc, pl, pt, rm, ro, ru, rup, sco, sk, sl, smn, sq, sr, sv, tk, tl, tr, tt, uk, + uz, vi. === Bug fixes in 1.21 === * (bug 40353) SpecialDoubleRedirect should support interwiki redirects. diff --git a/includes/Collation.php b/includes/Collation.php index 301904e..f57a14a 100644 --- a/includes/Collation.php +++ b/includes/Collation.php @@ -50,8 +50,12 @@ case 'uca-default': return new IcuCollation( 'root' ); default: - # Provide a mechanism for extensions to hook in. + $match = array(); + if ( preg_match( '/^uca-([a-z-]+)$/', $collationName, $match ) ) { + return new IcuCollation( $match[1] ); + } + # Provide a mechanism for extensions to hook in. 
$collationObject = null; wfRunHooks( 'Collation::factory', array( $collationName, &$collationObject ) ); @@ -194,6 +198,87 @@ array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement ); + /** + * Additional characters (or character groups) to be considered first-letters + * + * Generated based on the primary level of Unicode collation tailorings + * available at http://developer.mimer.com/charts/tailorings.htm . + * + * Empty arrays are intended; this signifies that the data for the language is + * available and that there are, in fact, no additional letters to consider. + */ + static $tailoringFirstLetters = array( + // Verified by native speakers + 'pl' => array( "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ), + 'fi' => array( "Å", "Ä", "Ö" ), + // Not verified, but likely correct + 'af' => array(), + 'ast' => array( "CH", "LL", "Ñ" ), + 'az' => array( "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ), + 'be' => array( "Ё" ), + 'bg' => array(), + 'br' => array( "CH", "C'H" ), + 'bs' => array( "Č", "Ć", "DŽ", "Đ", "LJ", "NJ", "Š", "Ž" ), + 'ca' => array(), + 'co' => array(), + 'cs' => array( "Č", "CH", "Ř", "Š", "Ž" ), + 'cy' => array( "CH", "DD", "FF", "NG", "LL", "PH", "RH", "TH" ), + 'da' => array( "Æ", "Ø", "Å" ), + 'de' => array(), + 'dsb' => array( "Č", "Ć", "DŹ", "Ě", "CH", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ), + 'el' => array(), + 'en' => array(), + 'eo' => array( "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ), + 'es' => array( "Ñ" ), + 'et' => array( "Š", "Ž", "Õ", "Ä", "Ö", "Ü" ), + 'eu' => array( "Ñ" ), + 'fo' => array( "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ), + 'fr' => array(), + 'fur' => array( "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ), + 'fy' => array(), + 'ga' => array(), + 'gd' => array(), + 'gl' => array( "CH", "LL", "Ñ" ), + 'hr' => array( "Č", "Ć", "DŽ", "Đ", "LJ", "NJ", "Š", "Ž" ), + 'hsb' => array( "Č", "DŹ", "Ě", "CH", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ), + 'hu' => array( "CS", "DZ", "DZS", "GY", "LY", "NY", "Ö", "SZ", "TY", "Ü", "ZS" ), + 'is' => array( "Á", "Ð", 
"É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ), + 'it' => array(), + 'kk' => array( "Ү", "І" ), + 'kl' => array( "Æ", "Ø", "Å" ), + 'ku' => array( "Ç", "Ê", "Î", "Ş", "Û" ), + 'ky' => array( "Ё" ), + 'la' => array(), + 'lb' => array(), + 'lt' => array( "Č", "Š", "Ž" ), + 'lv' => array( "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ), + 'mk' => array(), + 'mo' => array( "Ă", "Â", "Î", "Ş", "Ţ" ), + 'mt' => array( "Ċ", "Ġ", "GĦ", "Ħ", "Ż" ), + 'nl' => array(), + 'no' => array( "Æ", "Ø", "Å" ), + 'oc' => array(), + 'pt' => array(), + 'rm' => array(), + 'ro' => array( "Ă", "Â", "Î", "Ş", "Ţ" ), + 'ru' => array(), + 'rup' => array( "Ă", "Â", "Î", "Ľ", "Ń", "Ş", "Ţ" ), + 'sco' => array(), + 'sk' => array( "Ä", "Č", "CH", "Ô", "Š", "Ž" ), + 'sl' => array( "Č", "Š", "Ž" ), + 'smn' => array( "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ), + 'sq' => array( "Ç", "DH", "Ë", "GJ", "LL", "NJ", "RR", "SH", "TH", "XH", "ZH" ), + 'sr' => array(), + 'sv' => array( "Å", "Ä", "Ö" ), + 'tk' => array( "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ), + 'tl' => array( "Ñ", "NG" ), /* 'fil' in the data source */ + 'tr' => array( "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ), + 'tt' => array( "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ), + 'uk' => array( "Ґ", "Ь" ), + 'uz' => array( "CH", "G'", "NG", "O'", "SH" ), + 'vi' => array( "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ), + ); + const RECORD_LENGTH = 14; function __construct( $locale ) { @@ -274,10 +359,15 @@ // Generate data from serialized data file - $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" ); - if ( $letters === false ) { - throw new MWException( "MediaWiki does not support ICU locale " . 
- "\"{$this->locale}\"" ); + if ( isset ( self::$tailoringFirstLetters[$this->locale] ) ) { + $letters = wfGetPrecompiledData( "first-letters-root.ser" ); + $letters = array_merge( $letters, self::$tailoringFirstLetters[$this->locale] ); + } else { + $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" ); + if ( $letters === false ) { + throw new MWException( "MediaWiki does not support ICU locale " . + "\"{$this->locale}\"" ); + } } // Sort the letters. -- To view, visit https://gerrit.wikimedia.org/r/52924 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I838484b9aaf23945fe7880fef2e3da5f5c06877f Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/core Gerrit-Branch: wmf/1.21wmf10 Gerrit-Owner: Reedy <[email protected]> Gerrit-Reviewer: Matmarex <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
