MarkAHershberger has submitted this change and it was merged.
Change subject: backport changes in Collation from master
......................................................................
backport changes in Collation from master
This includes:
* I96bbcd73e75f9fc85a5c0b402eae87e5cda2259e
* I976dedfdc651fcc96a2291934924aa40b27f4c2f
* I4bd3d39ec2938a53e2c6728adc48ee6cf9778d74
Change-Id: Ibe25d9ec070720316202f31c429623bdcb840095
---
M includes/Collation.php
A tests/phpunit/includes/CollationTest.php
2 files changed, 177 insertions(+), 3 deletions(-)
Approvals:
MarkAHershberger: Looks good to me, approved
diff --git a/includes/Collation.php b/includes/Collation.php
index a0e1d2d..6bba019 100644
--- a/includes/Collation.php
+++ b/includes/Collation.php
@@ -213,7 +213,9 @@
'pl' => array( "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ),
'pt' => array(),
'ru' => array(),
+ 'sv' => array( "Å", "Ä", "Ö" ),
'uk' => array( "Ґ", "Ь" ),
+ 'vi' => array( "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ),
// Not verified, but likely correct
'af' => array(),
'ast' => array( "Ch", "Ll", "Ñ" ),
@@ -266,14 +268,11 @@
'smn' => array( "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø",
"Å", "Ä", "Ö" ),
'sq' => array( "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh",
"Th", "Xh", "Zh" ),
'sr' => array(),
- 'sv' => array( "Å", "Ä", "Ö" ),
- '-sv' => array( "Þ" ), // sorted as "th" in Swedish, causing
unexpected output - bug 45446
'tk' => array( "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ),
'tl' => array( "Ñ", "Ng" ),
'tr' => array( "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ),
'tt' => array( "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ),
'uz' => array( "Ch", "G'", "Ng", "O'", "Sh" ),
- 'vi' => array( "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ),
);
const RECORD_LENGTH = 14;
@@ -396,6 +395,72 @@
}
}
ksort( $letterMap, SORT_STRING );
+ // Remove duplicate prefixes. Basically if something has a
sortkey
+ // which is a prefix of some other sortkey, then it is an
+ // expansion and probably should not be considered a section
+ // header.
+ //
+ // For example 'þ' is sometimes sorted as if it is the letters
+ // 'th'. Other times it is its own primary element. Another
+ // example is '₨'. Sometimes its a currency symbol. Sometimes it
+ // is an 'R' followed by an 's'.
+ //
+ // Additionally an expanded element should always sort directly
+ // after its first element due to they way sortkeys work.
+ //
+ // UCA sortkey elements are of variable length but no collation
+ // element should be a prefix of some other element, so I think
+ // this is safe. See:
+ // *
https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
+ // *
http://site.icu-project.org/design/collation/uca-weight-allocation
+ //
+ // Additionally, there is something called primary compression
to
+ // worry about. Basically, if you have two primary elements that
+ // are more than one byte and both start with the same byte then
+ // the first byte is dropped on the second primary. Additionally
+ // either \x03 or \xFF may be added to mean that the next
primary
+ // does not start with the first byte of the first primary.
+ //
+ // This shouldn't matter much, as the first primary is not
+ // changed, and that is what we are comparing against.
+ //
+ // tl;dr: This makes some assumptions about how icu implements
+ // collations. It seems incredibly unlikely these assumptions
+ // will change, but nonetheless they are assumptions.
+
+ $prev = false;
+ $duplicatePrefixes = array();
+ foreach( $letterMap as $key => $value ) {
+ // Remove terminator byte. Otherwise the prefix
+ // comparison will get hung up on that.
+ $trimmedKey = rtrim( $key, "\0" );
+ if ( $prev === false || $prev === '' ) {
+ $prev = $trimmedKey;
+ // We don't yet have a collation element
+ // to compare against, so continue.
+ continue;
+ }
+
+ // Due to the fact the array is sorted, we only have
+ // to compare with the element directly previous
+ // to the current element (skipping expansions).
+ // An element "X" will always sort directly
+ // before "XZ" (Unless we have "XY", but we
+ // do not update $prev in that case).
+ if ( substr( $trimmedKey, 0, strlen( $prev ) ) ===
$prev ) {
+ $duplicatePrefixes[] = $key;
+ // If this is an expansion, we don't want to
+ // compare the next element to this element,
+ // but to what is currently $prev
+ continue;
+ }
+ $prev = $trimmedKey;
+ }
+ foreach( $duplicatePrefixes as $badKey ) {
+ wfDebug( "Removing '{$letterMap[$badKey]}' from first
letters." );
+ unset( $letterMap[$badKey] );
+ // This code assumes that unsetting does not change
sort order.
+ }
$data = array(
'chars' => array_values( $letterMap ),
'keys' => array_keys( $letterMap ),
diff --git a/tests/phpunit/includes/CollationTest.php
b/tests/phpunit/includes/CollationTest.php
new file mode 100644
index 0000000..c746208
--- /dev/null
+++ b/tests/phpunit/includes/CollationTest.php
@@ -0,0 +1,109 @@
+<?php
+class CollationTest extends MediaWikiLangTestCase {
+ protected function setUp() {
+ parent::setUp();
+ if ( !extension_loaded( 'intl' ) ) {
+ $this->markTestSkipped( 'These tests require intl
extension' );
+ }
+ }
+
+ /**
+ * Test to make sure, that if you
+ * have "X" and "XY", the binary
+ * sortkey also has "X" being a
+ * prefix of "XY". Our collation
+ * code makes this assumption.
+ *
+ * @param $lang String Language code for collator
+ * @param $base String Base string
+ * @param $extended String String containing base as a prefix.
+ *
+ * @dataProvider prefixDataProvider
+ */
+ function testIsPrefix( $lang, $base, $extended ) {
+ $cp = Collator::create( $lang );
+ $cp->setStrength( Collator::PRIMARY );
+ $baseBin = $cp->getSortKey( $base );
+ // Remove sortkey terminator
+ $baseBin = rtrim( $baseBin, "\0" );
+ $extendedBin = $cp->getSortKey( $extended );
+ $this->assertStringStartsWith( $baseBin, $extendedBin, "$base
is not a prefix of $extended" );
+ }
+
+ function prefixDataProvider() {
+ return array(
+ array( 'en', 'A', 'AA' ),
+ array( 'en', 'A', 'AAA' ),
+ array( 'en', 'Д', 'ДЂ' ),
+ array( 'en', 'Д', 'ДA' ),
+ // 'Ʒ' should expand to 'Z ' (note space).
+ array( 'fi', 'Z', 'Ʒ' ),
+ // 'Þ' should expand to 'th'
+ array( 'sv', 't', 'Þ' ),
+ // Javanese is a limited use alphabet, so should have 3
bytes
+ // per character, so do some tests with it.
+ array( 'en', 'ꦲ', 'ꦲꦤ' ),
+ array( 'en', 'ꦲ', 'ꦲД' ),
+ array( 'en', 'A', 'Aꦲ' ),
+ );
+ }
+ /**
+ * Opposite of testIsPrefix
+ *
+ * @dataProvider notPrefixDataProvider
+ */
+ function testNotIsPrefix( $lang, $base, $extended ) {
+ $cp = Collator::create( $lang );
+ $cp->setStrength( Collator::PRIMARY );
+ $baseBin = $cp->getSortKey( $base );
+ // Remove sortkey terminator
+ $baseBin = rtrim( $baseBin, "\0" );
+ $extendedBin = $cp->getSortKey( $extended );
+ $this->assertStringStartsNotWith( $baseBin, $extendedBin,
"$base is a prefix of $extended" );
+ }
+
+ function notPrefixDataProvider() {
+ return array(
+ array( 'en', 'A', 'B' ),
+ array( 'en', 'AC', 'ABC' ),
+ array( 'en', 'Z', 'Ʒ' ),
+ array( 'en', 'A', 'ꦲ' ),
+ );
+ }
+
+ /**
+ * Test correct first letter is fetched.
+ *
+ * @param $collation String Collation name (aka uca-en)
+ * @param $string String String to get first letter of
+ * @param $firstLetter String Expected first letter.
+ *
+ * @dataProvider firstLetterProvider
+ */
+ function testGetFirstLetter( $collation, $string, $firstLetter ) {
+ $col = Collation::factory( $collation );
+ $this->assertEquals( $firstLetter, $col->getFirstLetter(
$string ) );
+ }
+ function firstLetterProvider() {
+ return array(
+ array( 'uppercase', 'Abc', 'A' ),
+ array( 'uppercase', 'abc', 'A' ),
+ array( 'identity', 'abc', 'a' ),
+ array( 'uca-en', 'abc', 'A' ),
+ array( 'uca-en', ' ', ' ' ),
+ array( 'uca-en', 'Êveryone', 'E' ),
+ array( 'uca-vi', 'Êveryone', 'Ê' ),
+ // Make sure thorn is not a first letter.
+ array( 'uca-sv', 'The', 'T' ),
+ array( 'uca-sv', 'Å', 'Å' ),
+ array( 'uca-hu', 'dzsdo', 'Dzs' ),
+ array( 'uca-hu', 'dzdso', 'Dz' ),
+ array( 'uca-hu', 'CSD', 'Cs' ),
+ array( 'uca-root', 'CSD', 'C' ),
+ array( 'uca-fi', 'Ǥ', 'G' ),
+ array( 'uca-fi', 'Ŧ', 'T' ),
+ array( 'uca-fi', 'Ʒ', 'Z' ),
+ array( 'uca-fi', 'Ŋ', 'N' ),
+ );
+ }
+}
--
To view, visit https://gerrit.wikimedia.org/r/58418
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ibe25d9ec070720316202f31c429623bdcb840095
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: REL1_21
Gerrit-Owner: Matmarex <[email protected]>
Gerrit-Reviewer: MarkAHershberger <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits