Jeroen De Dauw has submitted this change and it was merged.

Change subject: Move search key generation to TermSqlIndex.
......................................................................


Move search key generation to TermSqlIndex.

Search key generation is specific to the storage/search engine.
The code was misplaced in the Term class, which required awkward
hacks to get access to the required helper classes / servces.

Change-Id: I8fd9eba0999b1bfcb69dba0bd84723e491f1e012
---
M lib/includes/Term.php
M lib/includes/store/sql/TermSqlIndex.php
M lib/tests/phpunit/TermTest.php
M repo/includes/store/sql/SqlStore.php
M repo/includes/store/sql/TermSearchKeyBuilder.php
M repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
6 files changed, 135 insertions(+), 135 deletions(-)

Approvals:
  Jeroen De Dauw: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/includes/Term.php b/lib/includes/Term.php
index 6fba0e7..807b763 100644
--- a/lib/includes/Term.php
+++ b/lib/includes/Term.php
@@ -211,76 +211,6 @@
        }
 
        /**
-        * @since 0.2
-        *
-        * @return string|null
-        */
-       public function getNormalizedText() {
-               $text = $this->getText();
-               $lang = $this->getLanguage();
-               return $text === null? null : self::normalizeText( $text, $lang 
);
-       }
-
-       /**
-        * @since 0.2
-        *
-        * @param string $text
-        * @param string $lang language code of the text's language, may be used
-        *        for specialized normalization.
-        *
-        * @return string
-        *
-        * @todo: Move this to TermSqlIndex
-        */
-       public static function normalizeText( $text, $lang = 'en' ) {
-               if ( $text === '' ) {
-                       return '';
-               }
-
-               //FIXME: move normalizeText to TermSqlIndex to avoid this mess!
-               if ( class_exists( 'Wikibase\Repo\WikibaseRepo' ) ) {
-                       $normalizer = 
WikibaseRepo::getDefaultInstance()->getStringNormalizer();
-               } elseif ( class_exists( 'Wikibase\Client\WikibaseClient' ) ) {
-                       $normalizer = 
WikibaseClient::getDefaultInstance()->getStringNormalizer();
-               } else {
-                       throw new \RuntimeException( "Found nither WikibaseRepo 
not WikibaseClient" );
-               }
-
-               // composed normal form
-               $nfcText = $normalizer->cleanupToNFC( $text );
-
-               if ( !is_string( $nfcText ) || $nfcText === '' ) {
-                       wfWarn( "Unicode normalization failed for `$text`" );
-               }
-
-               // \p{Z} - whitespace
-               // \p{C} - control chars
-               // WARNING: *any* invalid UTF8 sequence causes preg_replace to 
return an empty string.
-               $strippedText = $nfcText;
-               $strippedText = preg_replace( '/[\p{Cc}\p{Cf}\p{Cn}\p{Cs}]+/u', 
' ', $strippedText );
-               $strippedText = preg_replace( '/^[\p{Z}]+|[\p{Z}]+$/u', '', 
$strippedText );
-
-               if ( $strippedText === '' ) {
-                       // NOTE: This happens when there is only whitespace in 
the string.
-                       //       However, preg_replace will also return an 
empty string if it
-                       //       encounters any invalid utf-8 sequence.
-                       return '';
-               }
-
-               //TODO: Use Language::lc to convert to lower case.
-               //      But that requires us to load ALL the language objects,
-               //      which loads ALL the messages, which makes us run out
-               //      of RAM (see bug 41103).
-               $normalized = mb_strtolower( $strippedText, 'UTF-8' );
-
-               if ( !is_string( $normalized ) || $normalized === '' ) {
-                       wfWarn( "mb_strtolower normalization failed for 
`$strippedText`" );
-               }
-
-               return $normalized;
-       }
-
-       /**
         * Returns true if this Term object is equals to $that. This Term 
object is considered
         * equal to $that if $that is also an instance of Term, and 
$that->fields contains the
         * same values for the same fields as $this->fields.
diff --git a/lib/includes/store/sql/TermSqlIndex.php 
b/lib/includes/store/sql/TermSqlIndex.php
index 783d022..f9c10d8 100644
--- a/lib/includes/store/sql/TermSqlIndex.php
+++ b/lib/includes/store/sql/TermSqlIndex.php
@@ -41,6 +41,11 @@
        protected $tableName;
 
        /**
+        * @var StringNormalizer
+        */
+       protected $stringNormalizer;
+
+       /**
         * Maps table fields to TermIndex interface field names.
         *
         * @since 0.2
@@ -58,14 +63,15 @@
        /**
         * Constructor.
         *
-        * @since 0.1
+        * @since    0.4
         *
-        * @param string $tableName
-        * @param string|bool $wikiDb
+        * @param StringNormalizer $stringNormalizer
+        * @param string|bool      $wikiDb
         */
-       public function __construct( $tableName, $wikiDb = false ) {
+       public function __construct( StringNormalizer $stringNormalizer, 
$wikiDb = false ) {
                parent::__construct( $wikiDb );
-               $this->tableName = $tableName;
+               $this->stringNormalizer = $stringNormalizer;
+               $this->tableName = 'wb_terms';
        }
 
        /**
@@ -190,7 +196,7 @@
                );
 
                if ( !Settings::get( 'withoutTermSearchKey' ) ) {
-                       $fields['term_search_key'] = $term->getNormalizedText();
+                       $fields['term_search_key'] = $this->getSearchKey( 
$term->getText(), $term->getLanguage() );
                }
 
                return $fields;
@@ -620,7 +626,7 @@
                                }
                                else {
                                        $textField = 'term_search_key';
-                                       $text = $term->getNormalizedText();
+                                       $text = $this->getSearchKey( 
$term->getText(), $term->getLanguage() );
                                }
 
                                if ( $options['prefixSearch'] ) {
@@ -836,4 +842,56 @@
 
                return $resultTerms;
        }
+
+       /**
+        * @since 0.4
+        *
+        * @param string $text
+        * @param string $lang language code of the text's language, may be used
+        *                     for specialized normalization.
+        *
+        * @return string
+        */
+       public function getSearchKey( $text, $lang = 'en' ) {
+               if ( $text === null ) {
+                       return null;
+               }
+
+               if ( $text === '' ) {
+                       return '';
+               }
+
+               // composed normal form
+               $nfcText = $this->stringNormalizer->cleanupToNFC( $text );
+
+               if ( !is_string( $nfcText ) || $nfcText === '' ) {
+                       wfWarn( "Unicode normalization failed for `$text`" );
+               }
+
+               // \p{Z} - whitespace
+               // \p{C} - control chars
+               // WARNING: *any* invalid UTF8 sequence causes preg_replace to 
return an empty string.
+               $strippedText = $nfcText;
+               $strippedText = preg_replace( '/[\p{Cc}\p{Cf}\p{Cn}\p{Cs}]+/u', 
' ', $strippedText );
+               $strippedText = preg_replace( '/^[\p{Z}]+|[\p{Z}]+$/u', '', 
$strippedText );
+
+               if ( $strippedText === '' ) {
+                       // NOTE: This happens when there is only whitespace in 
the string.
+                       //       However, preg_replace will also return an 
empty string if it
+                       //       encounters any invalid utf-8 sequence.
+                       return '';
+               }
+
+               //TODO: Use Language::lc to convert to lower case.
+               //      But that requires us to load ALL the language objects,
+               //      which loads ALL the messages, which makes us run out
+               //      of RAM (see bug 41103).
+               $normalized = mb_strtolower( $strippedText, 'UTF-8' );
+
+               if ( !is_string( $normalized ) || $normalized === '' ) {
+                       wfWarn( "mb_strtolower normalization failed for 
`$strippedText`" );
+               }
+
+               return $normalized;
+       }
 }
diff --git a/lib/tests/phpunit/TermTest.php b/lib/tests/phpunit/TermTest.php
index 64331eb..9f7ca1c 100644
--- a/lib/tests/phpunit/TermTest.php
+++ b/lib/tests/phpunit/TermTest.php
@@ -71,58 +71,6 @@
                $this->assertEquals( isset( $fields['termText'] ) ? 
$fields['termText'] : null, $term->getText() );
        }
 
-       public static function provideGetNormalizedText() {
-               return array(
-                       array( // #0
-                               'foo', // raw
-                               'foo', // normalized
-                       ),
-
-                       array( // #1
-                               '  foo  ', // raw
-                               'foo', // normalized
-                       ),
-
-                       array( // #2: lower case of non-ascii character
-                               'ÄpFEl', // raw
-                               'äpfel', // normalized
-                       ),
-
-                       array( // #3: lower case of decomposed character
-                               "A\xCC\x88pfel", // raw
-                               'äpfel', // normalized
-                       ),
-
-                       array( // #4: lower case of cyrillic character
-                               'Берлин', // raw
-                               'берлин', // normalized
-                       ),
-
-                       array( // #5: lower case of greek character
-                               'Τάχιστη', // raw
-                               'τάχιστη', // normalized
-                       ),
-
-                       array( // #6: nasty unicode whitespace
-                               // ZWNJ: U+200C \xE2\x80\x8C
-                               // RTLM: U+200F \xE2\x80\x8F
-                               // PSEP: U+2029 \xE2\x80\xA9
-                               
"\xE2\x80\x8F\xE2\x80\x8Cfoo\xE2\x80\x8Cbar\xE2\x80\xA9", // raw
-                               "foo bar", // normalized
-                       ),
-               );
-       }
-
-       /**
-        * @dataProvider provideGetNormalizedText
-        */
-       public function testGetNormalizedText( $raw, $normalized ) {
-               $term = new Term( array() );
-
-               $term->setText( $raw );
-               $this->assertEquals( $normalized, $term->getNormalizedText() );
-       }
-
        public function testClone() {
                $term = new Term( array(
                        'termText' => 'Foo'
diff --git a/repo/includes/store/sql/SqlStore.php 
b/repo/includes/store/sql/SqlStore.php
index b29c3c2..5595d9e 100644
--- a/repo/includes/store/sql/SqlStore.php
+++ b/repo/includes/store/sql/SqlStore.php
@@ -67,7 +67,10 @@
         * @return TermIndex
         */
        protected function newTermIndex() {
-               return new TermSqlIndex( 'wb_terms' );
+               //TODO: Get $stringNormalizer from WikibaseRepo?
+               //      Can't really pass this via the constructor...
+               $stringNormalizer = new StringNormalizer();
+               return new TermSqlIndex( $stringNormalizer );
        }
 
        /**
diff --git a/repo/includes/store/sql/TermSearchKeyBuilder.php 
b/repo/includes/store/sql/TermSearchKeyBuilder.php
index f8fa085..afb4dc9 100644
--- a/repo/includes/store/sql/TermSearchKeyBuilder.php
+++ b/repo/includes/store/sql/TermSearchKeyBuilder.php
@@ -219,9 +219,9 @@
 
        /**
         * Updates a single row with a newley calculated search key.
-        * The search key is calculated using Term::normalizeText().
+        * The search key is calculated using TermSqlIndex::getSearchKey().
         *
-        * @see Term::normalizeText
+        * @see TermSqlIndex::getSearchKey
         *
         * @since 0.4
         *
@@ -233,7 +233,7 @@
         * @return string the search key
         */
        protected function updateSearchKey( \DatabaseBase $dbw, $rowId, $text, 
$lang ) {
-               $key = Term::normalizeText( $text, $lang );
+               $key = $this->table->getSearchKey( $text, $lang );
 
                $dbw->update(
                        $this->table->getTableName(),
diff --git a/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php 
b/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
index 7c5cfd3..7aebbac 100644
--- a/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
+++ b/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
@@ -2,6 +2,7 @@
 
 namespace Wikibase\Test;
 use Wikibase\Item;
+use Wikibase\StringNormalizer;
 use Wikibase\Term;
 
 /**
@@ -39,7 +40,8 @@
 class TermSqlIndexTest extends TermIndexTest {
 
        public function getTermIndex() {
-               return new \Wikibase\TermSqlIndex( 'wb_terms' );
+               $normalizer = new StringNormalizer();
+               return new \Wikibase\TermSqlIndex( $normalizer );
        }
 
        public function termProvider() {
@@ -69,7 +71,7 @@
                /**
                 * @var \Wikibase\TermSqlIndex $termIndex
                 */
-               $termIndex = \Wikibase\StoreFactory::getStore( 'sqlstore' 
)->getTermIndex();
+               $termIndex = $this->getTermIndex();
 
                $termIndex->clear();
 
@@ -98,4 +100,63 @@
                        $this->assertEquals( $termText, 
$obtainedTerm->getText() );
                }
        }
+
+       public static function provideGetSearchKey() {
+               return array(
+                       array( // #0
+                               'foo', // raw
+                               'en',  // lang
+                               'foo', // normalized
+                       ),
+
+                       array( // #1
+                               '  foo  ', // raw
+                               'en',  // lang
+                               'foo', // normalized
+                       ),
+
+                       array( // #2: lower case of non-ascii character
+                               'ÄpFEl', // raw
+                               'de',    // lang
+                               'äpfel', // normalized
+                       ),
+
+                       array( // #3: lower case of decomposed character
+                               "A\xCC\x88pfel", // raw
+                               'de',    // lang
+                               'äpfel', // normalized
+                       ),
+
+                       array( // #4: lower case of cyrillic character
+                               'Берлин', // raw
+                               'ru',     // lang
+                               'берлин', // normalized
+                       ),
+
+                       array( // #5: lower case of greek character
+                               'Τάχιστη', // raw
+                               'he',      // lang
+                               'τάχιστη', // normalized
+                       ),
+
+                       array( // #6: nasty unicode whitespace
+                               // ZWNJ: U+200C \xE2\x80\x8C
+                               // RTLM: U+200F \xE2\x80\x8F
+                               // PSEP: U+2029 \xE2\x80\xA9
+                               
"\xE2\x80\x8F\xE2\x80\x8Cfoo\xE2\x80\x8Cbar\xE2\x80\xA9", // raw
+                               'en',      // lang
+                               "foo bar", // normalized
+                       ),
+               );
+       }
+
+       /**
+        * @dataProvider provideGetSearchKey
+        */
+       public function testGetSearchKey( $raw, $lang, $normalized ) {
+               $index = $this->getTermIndex();
+
+               $key = $index->getSearchKey( $raw, $lang );
+               $this->assertEquals( $normalized, $key );
+       }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/72079
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I8fd9eba0999b1bfcb69dba0bd84723e491f1e012
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Aude <[email protected]>
Gerrit-Reviewer: Daniel Werner <[email protected]>
Gerrit-Reviewer: Jeroen De Dauw <[email protected]>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to