Jeroen De Dauw has submitted this change and it was merged.
Change subject: Move search key generation to TermSqlIndex.
......................................................................
Move search key generation to TermSqlIndex.
Search key generation is specific to the storage/search engine.
The code was misplaced in the Term class, which required awkward
hacks to get access to the required helper classes / servces.
Change-Id: I8fd9eba0999b1bfcb69dba0bd84723e491f1e012
---
M lib/includes/Term.php
M lib/includes/store/sql/TermSqlIndex.php
M lib/tests/phpunit/TermTest.php
M repo/includes/store/sql/SqlStore.php
M repo/includes/store/sql/TermSearchKeyBuilder.php
M repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
6 files changed, 135 insertions(+), 135 deletions(-)
Approvals:
Jeroen De Dauw: Looks good to me, approved
jenkins-bot: Verified
diff --git a/lib/includes/Term.php b/lib/includes/Term.php
index 6fba0e7..807b763 100644
--- a/lib/includes/Term.php
+++ b/lib/includes/Term.php
@@ -211,76 +211,6 @@
}
/**
- * @since 0.2
- *
- * @return string|null
- */
- public function getNormalizedText() {
- $text = $this->getText();
- $lang = $this->getLanguage();
- return $text === null? null : self::normalizeText( $text, $lang
);
- }
-
- /**
- * @since 0.2
- *
- * @param string $text
- * @param string $lang language code of the text's language, may be used
- * for specialized normalization.
- *
- * @return string
- *
- * @todo: Move this to TermSqlIndex
- */
- public static function normalizeText( $text, $lang = 'en' ) {
- if ( $text === '' ) {
- return '';
- }
-
- //FIXME: move normalizeText to TermSqlIndex to avoid this mess!
- if ( class_exists( 'Wikibase\Repo\WikibaseRepo' ) ) {
- $normalizer =
WikibaseRepo::getDefaultInstance()->getStringNormalizer();
- } elseif ( class_exists( 'Wikibase\Client\WikibaseClient' ) ) {
- $normalizer =
WikibaseClient::getDefaultInstance()->getStringNormalizer();
- } else {
- throw new \RuntimeException( "Found nither WikibaseRepo
not WikibaseClient" );
- }
-
- // composed normal form
- $nfcText = $normalizer->cleanupToNFC( $text );
-
- if ( !is_string( $nfcText ) || $nfcText === '' ) {
- wfWarn( "Unicode normalization failed for `$text`" );
- }
-
- // \p{Z} - whitespace
- // \p{C} - control chars
- // WARNING: *any* invalid UTF8 sequence causes preg_replace to
return an empty string.
- $strippedText = $nfcText;
- $strippedText = preg_replace( '/[\p{Cc}\p{Cf}\p{Cn}\p{Cs}]+/u',
' ', $strippedText );
- $strippedText = preg_replace( '/^[\p{Z}]+|[\p{Z}]+$/u', '',
$strippedText );
-
- if ( $strippedText === '' ) {
- // NOTE: This happens when there is only whitespace in
the string.
- // However, preg_replace will also return an
empty string if it
- // encounters any invalid utf-8 sequence.
- return '';
- }
-
- //TODO: Use Language::lc to convert to lower case.
- // But that requires us to load ALL the language objects,
- // which loads ALL the messages, which makes us run out
- // of RAM (see bug 41103).
- $normalized = mb_strtolower( $strippedText, 'UTF-8' );
-
- if ( !is_string( $normalized ) || $normalized === '' ) {
- wfWarn( "mb_strtolower normalization failed for
`$strippedText`" );
- }
-
- return $normalized;
- }
-
- /**
* Returns true if this Term object is equals to $that. This Term
object is considered
* equal to $that if $that is also an instance of Term, and
$that->fields contains the
* same values for the same fields as $this->fields.
diff --git a/lib/includes/store/sql/TermSqlIndex.php
b/lib/includes/store/sql/TermSqlIndex.php
index 783d022..f9c10d8 100644
--- a/lib/includes/store/sql/TermSqlIndex.php
+++ b/lib/includes/store/sql/TermSqlIndex.php
@@ -41,6 +41,11 @@
protected $tableName;
/**
+ * @var StringNormalizer
+ */
+ protected $stringNormalizer;
+
+ /**
* Maps table fields to TermIndex interface field names.
*
* @since 0.2
@@ -58,14 +63,15 @@
/**
* Constructor.
*
- * @since 0.1
+ * @since 0.4
*
- * @param string $tableName
- * @param string|bool $wikiDb
+ * @param StringNormalizer $stringNormalizer
+ * @param string|bool $wikiDb
*/
- public function __construct( $tableName, $wikiDb = false ) {
+ public function __construct( StringNormalizer $stringNormalizer,
$wikiDb = false ) {
parent::__construct( $wikiDb );
- $this->tableName = $tableName;
+ $this->stringNormalizer = $stringNormalizer;
+ $this->tableName = 'wb_terms';
}
/**
@@ -190,7 +196,7 @@
);
if ( !Settings::get( 'withoutTermSearchKey' ) ) {
- $fields['term_search_key'] = $term->getNormalizedText();
+ $fields['term_search_key'] = $this->getSearchKey(
$term->getText(), $term->getLanguage() );
}
return $fields;
@@ -620,7 +626,7 @@
}
else {
$textField = 'term_search_key';
- $text = $term->getNormalizedText();
+ $text = $this->getSearchKey(
$term->getText(), $term->getLanguage() );
}
if ( $options['prefixSearch'] ) {
@@ -836,4 +842,56 @@
return $resultTerms;
}
+
+ /**
+ * @since 0.4
+ *
+ * @param string $text
+ * @param string $lang language code of the text's language, may be used
+ * for specialized normalization.
+ *
+ * @return string
+ */
+ public function getSearchKey( $text, $lang = 'en' ) {
+ if ( $text === null ) {
+ return null;
+ }
+
+ if ( $text === '' ) {
+ return '';
+ }
+
+ // composed normal form
+ $nfcText = $this->stringNormalizer->cleanupToNFC( $text );
+
+ if ( !is_string( $nfcText ) || $nfcText === '' ) {
+ wfWarn( "Unicode normalization failed for `$text`" );
+ }
+
+ // \p{Z} - whitespace
+ // \p{C} - control chars
+ // WARNING: *any* invalid UTF8 sequence causes preg_replace to
return an empty string.
+ $strippedText = $nfcText;
+ $strippedText = preg_replace( '/[\p{Cc}\p{Cf}\p{Cn}\p{Cs}]+/u',
' ', $strippedText );
+ $strippedText = preg_replace( '/^[\p{Z}]+|[\p{Z}]+$/u', '',
$strippedText );
+
+ if ( $strippedText === '' ) {
+ // NOTE: This happens when there is only whitespace in
the string.
+ // However, preg_replace will also return an
empty string if it
+ // encounters any invalid utf-8 sequence.
+ return '';
+ }
+
+ //TODO: Use Language::lc to convert to lower case.
+ // But that requires us to load ALL the language objects,
+ // which loads ALL the messages, which makes us run out
+ // of RAM (see bug 41103).
+ $normalized = mb_strtolower( $strippedText, 'UTF-8' );
+
+ if ( !is_string( $normalized ) || $normalized === '' ) {
+ wfWarn( "mb_strtolower normalization failed for
`$strippedText`" );
+ }
+
+ return $normalized;
+ }
}
diff --git a/lib/tests/phpunit/TermTest.php b/lib/tests/phpunit/TermTest.php
index 64331eb..9f7ca1c 100644
--- a/lib/tests/phpunit/TermTest.php
+++ b/lib/tests/phpunit/TermTest.php
@@ -71,58 +71,6 @@
$this->assertEquals( isset( $fields['termText'] ) ?
$fields['termText'] : null, $term->getText() );
}
- public static function provideGetNormalizedText() {
- return array(
- array( // #0
- 'foo', // raw
- 'foo', // normalized
- ),
-
- array( // #1
- ' foo ', // raw
- 'foo', // normalized
- ),
-
- array( // #2: lower case of non-ascii character
- 'ÄpFEl', // raw
- 'äpfel', // normalized
- ),
-
- array( // #3: lower case of decomposed character
- "A\xCC\x88pfel", // raw
- 'äpfel', // normalized
- ),
-
- array( // #4: lower case of cyrillic character
- 'Берлин', // raw
- 'берлин', // normalized
- ),
-
- array( // #5: lower case of greek character
- 'Τάχιστη', // raw
- 'τάχιστη', // normalized
- ),
-
- array( // #6: nasty unicode whitespace
- // ZWNJ: U+200C \xE2\x80\x8C
- // RTLM: U+200F \xE2\x80\x8F
- // PSEP: U+2029 \xE2\x80\xA9
-
"\xE2\x80\x8F\xE2\x80\x8Cfoo\xE2\x80\x8Cbar\xE2\x80\xA9", // raw
- "foo bar", // normalized
- ),
- );
- }
-
- /**
- * @dataProvider provideGetNormalizedText
- */
- public function testGetNormalizedText( $raw, $normalized ) {
- $term = new Term( array() );
-
- $term->setText( $raw );
- $this->assertEquals( $normalized, $term->getNormalizedText() );
- }
-
public function testClone() {
$term = new Term( array(
'termText' => 'Foo'
diff --git a/repo/includes/store/sql/SqlStore.php
b/repo/includes/store/sql/SqlStore.php
index b29c3c2..5595d9e 100644
--- a/repo/includes/store/sql/SqlStore.php
+++ b/repo/includes/store/sql/SqlStore.php
@@ -67,7 +67,10 @@
* @return TermIndex
*/
protected function newTermIndex() {
- return new TermSqlIndex( 'wb_terms' );
+ //TODO: Get $stringNormalizer from WikibaseRepo?
+ // Can't really pass this via the constructor...
+ $stringNormalizer = new StringNormalizer();
+ return new TermSqlIndex( $stringNormalizer );
}
/**
diff --git a/repo/includes/store/sql/TermSearchKeyBuilder.php
b/repo/includes/store/sql/TermSearchKeyBuilder.php
index f8fa085..afb4dc9 100644
--- a/repo/includes/store/sql/TermSearchKeyBuilder.php
+++ b/repo/includes/store/sql/TermSearchKeyBuilder.php
@@ -219,9 +219,9 @@
/**
* Updates a single row with a newley calculated search key.
- * The search key is calculated using Term::normalizeText().
+ * The search key is calculated using TermSqlIndex::getSearchKey().
*
- * @see Term::normalizeText
+ * @see TermSqlIndex::getSearchKey
*
* @since 0.4
*
@@ -233,7 +233,7 @@
* @return string the search key
*/
protected function updateSearchKey( \DatabaseBase $dbw, $rowId, $text,
$lang ) {
- $key = Term::normalizeText( $text, $lang );
+ $key = $this->table->getSearchKey( $text, $lang );
$dbw->update(
$this->table->getTableName(),
diff --git a/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
b/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
index 7c5cfd3..7aebbac 100644
--- a/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
+++ b/repo/tests/phpunit/includes/store/sql/TermSqlIndexTest.php
@@ -2,6 +2,7 @@
namespace Wikibase\Test;
use Wikibase\Item;
+use Wikibase\StringNormalizer;
use Wikibase\Term;
/**
@@ -39,7 +40,8 @@
class TermSqlIndexTest extends TermIndexTest {
public function getTermIndex() {
- return new \Wikibase\TermSqlIndex( 'wb_terms' );
+ $normalizer = new StringNormalizer();
+ return new \Wikibase\TermSqlIndex( $normalizer );
}
public function termProvider() {
@@ -69,7 +71,7 @@
/**
* @var \Wikibase\TermSqlIndex $termIndex
*/
- $termIndex = \Wikibase\StoreFactory::getStore( 'sqlstore'
)->getTermIndex();
+ $termIndex = $this->getTermIndex();
$termIndex->clear();
@@ -98,4 +100,63 @@
$this->assertEquals( $termText,
$obtainedTerm->getText() );
}
}
+
+ public static function provideGetSearchKey() {
+ return array(
+ array( // #0
+ 'foo', // raw
+ 'en', // lang
+ 'foo', // normalized
+ ),
+
+ array( // #1
+ ' foo ', // raw
+ 'en', // lang
+ 'foo', // normalized
+ ),
+
+ array( // #2: lower case of non-ascii character
+ 'ÄpFEl', // raw
+ 'de', // lang
+ 'äpfel', // normalized
+ ),
+
+ array( // #3: lower case of decomposed character
+ "A\xCC\x88pfel", // raw
+ 'de', // lang
+ 'äpfel', // normalized
+ ),
+
+ array( // #4: lower case of cyrillic character
+ 'Берлин', // raw
+ 'ru', // lang
+ 'берлин', // normalized
+ ),
+
+ array( // #5: lower case of greek character
+ 'Τάχιστη', // raw
+ 'he', // lang
+ 'τάχιστη', // normalized
+ ),
+
+ array( // #6: nasty unicode whitespace
+ // ZWNJ: U+200C \xE2\x80\x8C
+ // RTLM: U+200F \xE2\x80\x8F
+ // PSEP: U+2029 \xE2\x80\xA9
+
"\xE2\x80\x8F\xE2\x80\x8Cfoo\xE2\x80\x8Cbar\xE2\x80\xA9", // raw
+ 'en', // lang
+ "foo bar", // normalized
+ ),
+ );
+ }
+
+ /**
+ * @dataProvider provideGetSearchKey
+ */
+ public function testGetSearchKey( $raw, $lang, $normalized ) {
+ $index = $this->getTermIndex();
+
+ $key = $index->getSearchKey( $raw, $lang );
+ $this->assertEquals( $normalized, $key );
+ }
}
--
To view, visit https://gerrit.wikimedia.org/r/72079
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I8fd9eba0999b1bfcb69dba0bd84723e491f1e012
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Aude <[email protected]>
Gerrit-Reviewer: Daniel Werner <[email protected]>
Gerrit-Reviewer: Jeroen De Dauw <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits